author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-10-07 09:19:14 -0700
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-10-07 09:19:14 -0700
commit | 513389809e138ae903b6ef43c1d5d2ffaf4dca17 (patch)
tree | c71e478fab1568da4706868b14eb67a75c148a8b /block
parent | 0a78a376ef3c2f3d397df48909f00cd75f92137a (diff)
parent | 30514bd2dd4e86a3ecfd6a93a3eadf7b9ea164a0 (diff)
download | linux-513389809e138ae903b6ef43c1d5d2ffaf4dca17.tar.gz
download | linux-513389809e138ae903b6ef43c1d5d2ffaf4dca17.tar.bz2
download | linux-513389809e138ae903b6ef43c1d5d2ffaf4dca17.zip
Merge tag 'for-6.1/block-2022-10-03' of git://git.kernel.dk/linux
Pull block updates from Jens Axboe:
- NVMe pull requests via Christoph:
- handle number of queue changes in the TCP and RDMA drivers
(Daniel Wagner)
- allow changing the number of queues in nvmet (Daniel Wagner)
- also consider host_iface when checking ip options (Daniel
Wagner)
- don't map pages which can't come from HIGHMEM (Fabio M. De
Francesco)
- avoid unnecessary flush bios in nvmet (Guixin Liu)
- shrink and better pack the nvme_iod structure (Keith Busch)
- add comment for unaligned "fake" nqn (Linjun Bao)
- print actual source IP address through sysfs "address" attr
(Martin Belanger)
- various cleanups (Jackie Liu, Wolfram Sang, Genjian Zhang)
- handle effects after freeing the request (Keith Busch)
- copy firmware_rev on each init (Keith Busch)
- restrict management ioctls to admin (Keith Busch)
- ensure subsystem reset is single threaded (Keith Busch)
- report the actual number of tagset maps in nvme-pci (Keith
Busch)
- small fabrics authentication fixups (Christoph Hellwig)
- add common code for tagset allocation and freeing (Christoph
Hellwig)
- stop using the request_queue in nvmet (Christoph Hellwig)
- set min_align_mask before calculating max_hw_sectors (Rishabh
Bhatnagar)
- send a rediscover uevent when a persistent discovery controller
reconnects (Sagi Grimberg)
- misc nvmet-tcp fixes (Varun Prakash, zhenwei pi)
- MD pull request via Song:
- Various raid5 fixes and cleanups, by Logan Gunthorpe and David
Sloan.
- Raid10 performance optimization, by Yu Kuai.
- sbitmap wakeup hang fixes (Hugh, Keith, Jan, Yu)
- IO scheduler switching quiesce fix (Keith)
- s390/dasd block driver updates (Stefan)
- support for recovery for the ublk driver (ZiyangZhang)
- rnbd drivers fixes and updates (Guoqing, Santosh, ye, Christoph)
- blk-mq and null_blk map fixes (Bart)
- various bcache fixes (Coly, Jilin, Jules)
- nbd signal hang fix (Shigeru)
- block writeback throttling fix (Yu)
- optimize the passthrough mapping handling (me)
- prepare block cgroups to be gendisk based (Christoph)
- get rid of an old PSI hack in the block layer, moving it to the
callers instead, where it belongs (Christoph); a caller-side sketch
follows this list
- blk-throttle fixes and cleanups (Yu)
- misc fixes and cleanups (Liu Shixin, Liu Song, Miaohe, Pankaj,
Ping-Xiang, Wolfram, Saurabh, Li Jinlin, Li Lei, Lin, Li zeming,
Miaohe, Bart, Coly, Gaosheng)
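The PSI change referenced above removes the BIO_WORKINGSET-based memory-stall
accounting from submit_bio() (see the blk-core.c and bio.c hunks below) and
leaves it to the submitting code. The following is a minimal, hypothetical
caller-side sketch only: psi_memstall_enter()/psi_memstall_leave() and
submit_bio() are existing kernel interfaces visible in this diff, while the
wrapper name and the "workingset" parameter are purely illustrative and not
taken from this series.

```c
#include <linux/bio.h>
#include <linux/psi.h>

/*
 * Illustrative helper: a reader that knows it is faulting in workingset
 * pages accounts the stall around its own submission, instead of relying
 * on the block layer to detect BIO_WORKINGSET.
 */
static void example_submit_workingset_read(struct bio *bio, bool workingset)
{
	unsigned long pflags;

	if (workingset)
		psi_memstall_enter(&pflags);

	submit_bio(bio);

	if (workingset)
		psi_memstall_leave(&pflags);
}
```

Keeping this pattern in the actual read paths (rather than in submit_bio())
also means direct I/O is no longer flagged and accounted as a memory stall,
which matches the bio_iov_iter_get_pages() cleanup in the bio.c hunk below.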
* tag 'for-6.1/block-2022-10-03' of git://git.kernel.dk/linux: (162 commits)
sbitmap: fix lockup while swapping
block: add rationale for not using blk_mq_plug() when applicable
block: adapt blk_mq_plug() to not plug for writes that require a zone lock
s390/dasd: use blk_mq_alloc_disk
blk-cgroup: don't update the blkg lookup hint in blkg_conf_prep
nvmet: don't look at the request_queue in nvmet_bdev_set_limits
nvmet: don't look at the request_queue in nvmet_bdev_zone_mgmt_emulate_all
blk-mq: use quiesced elevator switch when reinitializing queues
block: replace blk_queue_nowait with bdev_nowait
nvme: remove nvme_ctrl_init_connect_q
nvme-loop: use the tagset alloc/free helpers
nvme-loop: store the generic nvme_ctrl in set->driver_data
nvme-loop: initialize sqsize later
nvme-fc: use the tagset alloc/free helpers
nvme-fc: store the generic nvme_ctrl in set->driver_data
nvme-fc: keep ctrl->sqsize in sync with opts->queue_size
nvme-rdma: use the tagset alloc/free helpers
nvme-rdma: store the generic nvme_ctrl in set->driver_data
nvme-tcp: use the tagset alloc/free helpers
nvme-tcp: store the generic nvme_ctrl in set->driver_data
...
Diffstat (limited to 'block')
-rw-r--r-- | block/bfq-cgroup.c | 5
-rw-r--r-- | block/bfq-iosched.c | 14
-rw-r--r-- | block/bfq-iosched.h | 18
-rw-r--r-- | block/bfq-wf2q.c | 9
-rw-r--r-- | block/bio.c | 13
-rw-r--r-- | block/blk-cgroup.c | 183
-rw-r--r-- | block/blk-cgroup.h | 68
-rw-r--r-- | block/blk-core.c | 37
-rw-r--r-- | block/blk-iocost.c | 39
-rw-r--r-- | block/blk-iolatency.c | 5
-rw-r--r-- | block/blk-ioprio.c | 8
-rw-r--r-- | block/blk-ioprio.h | 8
-rw-r--r-- | block/blk-map.c | 52
-rw-r--r-- | block/blk-mq-cpumap.c | 4
-rw-r--r-- | block/blk-mq-debugfs.c | 2
-rw-r--r-- | block/blk-mq-pci.c | 7
-rw-r--r-- | block/blk-mq-rdma.c | 6
-rw-r--r-- | block/blk-mq-tag.c | 2
-rw-r--r-- | block/blk-mq-virtio.c | 7
-rw-r--r-- | block/blk-mq.c | 32
-rw-r--r-- | block/blk-mq.h | 3
-rw-r--r-- | block/blk-rq-qos.h | 1
-rw-r--r-- | block/blk-sysfs.c | 2
-rw-r--r-- | block/blk-throttle.c | 280
-rw-r--r-- | block/blk-throttle.h | 53
-rw-r--r-- | block/blk-wbt.c | 9
-rw-r--r-- | block/blk-zoned.c | 9
-rw-r--r-- | block/blk.h | 7
-rw-r--r-- | block/elevator.c | 4
-rw-r--r-- | block/genhd.c | 7
-rw-r--r-- | block/opal_proto.h | 5
-rw-r--r-- | block/sed-opal.c | 89
32 files changed, 532 insertions, 456 deletions
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 30b15a9a47c4..144bca006463 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -254,17 +254,12 @@ void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, #else /* CONFIG_BFQ_CGROUP_DEBUG */ -void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, - blk_opf_t opf) { } void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf) { } void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf) { } void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, u64 io_start_time_ns, blk_opf_t opf) { } void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } -void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } -void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } -void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } #endif /* CONFIG_BFQ_CGROUP_DEBUG */ diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index c740b41fe0a4..7ea427817f7f 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1925,7 +1925,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, bfqq->service_from_backlogged = 0; bfq_clear_bfqq_softrt_update(bfqq); - bfq_add_bfqq_busy(bfqd, bfqq); + bfq_add_bfqq_busy(bfqq); /* * Expire in-service queue if preemption may be needed for @@ -2419,7 +2419,7 @@ static void bfq_remove_request(struct request_queue *q, bfqq->next_rq = NULL; if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { - bfq_del_bfqq_busy(bfqd, bfqq, false); + bfq_del_bfqq_busy(bfqq, false); /* * bfqq emptied. In normal operation, when * bfqq is empty, bfqq->entity.service and @@ -3098,7 +3098,7 @@ void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq) */ if (bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) && bfqq != bfqd->in_service_queue) - bfq_del_bfqq_busy(bfqd, bfqq, false); + bfq_del_bfqq_busy(bfqq, false); bfq_reassign_last_bfqq(bfqq, NULL); @@ -3908,7 +3908,7 @@ static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, */ bfqq->budget_timeout = jiffies; - bfq_del_bfqq_busy(bfqd, bfqq, true); + bfq_del_bfqq_busy(bfqq, true); } else { bfq_requeue_bfqq(bfqd, bfqq, true); /* @@ -5255,9 +5255,7 @@ void bfq_put_queue(struct bfq_queue *bfqq) struct hlist_node *n; struct bfq_group *bfqg = bfqq_group(bfqq); - if (bfqq->bfqd) - bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", - bfqq, bfqq->ref); + bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); bfqq->ref--; if (bfqq->ref) @@ -5321,7 +5319,7 @@ void bfq_put_queue(struct bfq_queue *bfqq) hlist_del_init(&item->woken_list_node); } - if (bfqq->bfqd && bfqq->bfqd->last_completed_rq_bfqq == bfqq) + if (bfqq->bfqd->last_completed_rq_bfqq == bfqq) bfqq->bfqd->last_completed_rq_bfqq = NULL; kmem_cache_free(bfq_pool, bfqq); diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index ad8e513d7e87..64ee618064ba 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -993,20 +993,23 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); /* ---------------- cgroups-support interface ---------------- */ void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq); -void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, - blk_opf_t opf); void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf); void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t 
opf); void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, u64 io_start_time_ns, blk_opf_t opf); void bfqg_stats_update_dequeue(struct bfq_group *bfqg); -void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); -void bfqg_stats_update_idle_time(struct bfq_group *bfqg); void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg); -void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg); void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_group *bfqg); +#ifdef CONFIG_BFQ_CGROUP_DEBUG +void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, + blk_opf_t opf); +void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); +void bfqg_stats_update_idle_time(struct bfq_group *bfqg); +void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg); +#endif + void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg); void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio); void bfq_end_wr_async(struct bfq_data *bfqd); @@ -1077,9 +1080,8 @@ void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bool expiration); -void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bool expiration); -void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq); +void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration); +void bfq_add_bfqq_busy(struct bfq_queue *bfqq); /* --------------- end of interface of B-WF2Q+ ---------------- */ diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 983413cdefad..8fc3da4c23bb 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -1651,9 +1651,10 @@ void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, * the service tree. As a special case, it can be invoked during an * expiration. */ -void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bool expiration) +void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration) { + struct bfq_data *bfqd = bfqq->bfqd; + bfq_log_bfqq(bfqd, bfqq, "del from busy"); bfq_clear_bfqq_busy(bfqq); @@ -1674,8 +1675,10 @@ void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, /* * Called when an inactive queue receives a new request. */ -void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) +void bfq_add_bfqq_busy(struct bfq_queue *bfqq) { + struct bfq_data *bfqd = bfqq->bfqd; + bfq_log_bfqq(bfqd, bfqq, "add to busy"); bfq_activate_bfqq(bfqd, bfqq); diff --git a/block/bio.c b/block/bio.c index 3d3a2678fea2..7cb7d2ff139b 100644 --- a/block/bio.c +++ b/block/bio.c @@ -760,8 +760,6 @@ EXPORT_SYMBOL(bio_put); static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp) { bio_set_flag(bio, BIO_CLONED); - if (bio_flagged(bio_src, BIO_THROTTLED)) - bio_set_flag(bio, BIO_THROTTLED); bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_iter = bio_src->bi_iter; @@ -1065,9 +1063,6 @@ void __bio_add_page(struct bio *bio, struct page *page, bio->bi_iter.bi_size += len; bio->bi_vcnt++; - - if (!bio_flagged(bio, BIO_WORKINGSET) && unlikely(PageWorkingset(page))) - bio_set_flag(bio, BIO_WORKINGSET); } EXPORT_SYMBOL_GPL(__bio_add_page); @@ -1276,9 +1271,6 @@ out: * fit into the bio, or are requested in @iter, whatever is smaller. If * MM encounters an error pinning the requested pages, it stops. Error * is returned only if 0 pages could be pinned. 
- * - * It's intended for direct IO, so doesn't do PSI tracking, the caller is - * responsible for setting BIO_WORKINGSET if necessary. */ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { @@ -1294,8 +1286,6 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) ret = __bio_iov_iter_get_pages(bio, iter); } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); - /* don't account direct I/O as memory stall */ - bio_clear_flag(bio, BIO_WORKINGSET); return bio->bi_vcnt ? 0 : ret; } EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); @@ -1754,7 +1744,8 @@ static int __init init_bio(void) cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL, bio_cpu_dead); - if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS)) + if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, + BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE)) panic("bio: can't allocate bios\n"); if (bioset_integrity_create(&fs_bio_set, BIO_POOL_SIZE)) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 869af9d72bcf..6a5c849ee061 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -202,19 +202,19 @@ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) /** * blkg_alloc - allocate a blkg * @blkcg: block cgroup the new blkg is associated with - * @q: request_queue the new blkg is associated with + * @disk: gendisk the new blkg is associated with * @gfp_mask: allocation mask to use * * Allocate a new blkg assocating @blkcg and @q. */ -static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, +static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk, gfp_t gfp_mask) { struct blkcg_gq *blkg; int i, cpu; /* alloc and init base part */ - blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node); + blkg = kzalloc_node(sizeof(*blkg), gfp_mask, disk->queue->node); if (!blkg) return NULL; @@ -225,10 +225,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, if (!blkg->iostat_cpu) goto err_free; - if (!blk_get_queue(q)) + if (!blk_get_queue(disk->queue)) goto err_free; - blkg->q = q; + blkg->q = disk->queue; INIT_LIST_HEAD(&blkg->q_node); spin_lock_init(&blkg->async_bio_lock); bio_list_init(&blkg->async_bios); @@ -243,11 +243,11 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, struct blkcg_policy *pol = blkcg_policy[i]; struct blkg_policy_data *pd; - if (!blkcg_policy_enabled(q, pol)) + if (!blkcg_policy_enabled(disk->queue, pol)) continue; /* alloc per-policy data and attach it to blkg */ - pd = pol->pd_alloc_fn(gfp_mask, q, blkcg); + pd = pol->pd_alloc_fn(gfp_mask, disk->queue, blkcg); if (!pd) goto err_free; @@ -263,45 +263,20 @@ err_free: return NULL; } -struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, - struct request_queue *q, bool update_hint) -{ - struct blkcg_gq *blkg; - - /* - * Hint didn't match. Look up from the radix tree. Note that the - * hint can only be updated under queue_lock as otherwise @blkg - * could have already been removed from blkg_tree. The caller is - * responsible for grabbing queue_lock if @update_hint. - */ - blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); - if (blkg && blkg->q == q) { - if (update_hint) { - lockdep_assert_held(&q->queue_lock); - rcu_assign_pointer(blkcg->blkg_hint, blkg); - } - return blkg; - } - - return NULL; -} -EXPORT_SYMBOL_GPL(blkg_lookup_slowpath); - /* * If @new_blkg is %NULL, this function tries to allocate a new one as * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return. 
*/ -static struct blkcg_gq *blkg_create(struct blkcg *blkcg, - struct request_queue *q, +static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk, struct blkcg_gq *new_blkg) { struct blkcg_gq *blkg; int i, ret; - lockdep_assert_held(&q->queue_lock); + lockdep_assert_held(&disk->queue->queue_lock); /* request_queue is dying, do not create/recreate a blkg */ - if (blk_queue_dying(q)) { + if (blk_queue_dying(disk->queue)) { ret = -ENODEV; goto err_free_blkg; } @@ -314,7 +289,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, /* allocate */ if (!new_blkg) { - new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN); + new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT | __GFP_NOWARN); if (unlikely(!new_blkg)) { ret = -ENOMEM; goto err_put_css; @@ -324,7 +299,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, /* link parent */ if (blkcg_parent(blkcg)) { - blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); + blkg->parent = blkg_lookup(blkcg_parent(blkcg), disk->queue); if (WARN_ON_ONCE(!blkg->parent)) { ret = -ENODEV; goto err_put_css; @@ -342,10 +317,10 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, /* insert */ spin_lock(&blkcg->lock); - ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); + ret = radix_tree_insert(&blkcg->blkg_tree, disk->queue->id, blkg); if (likely(!ret)) { hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); - list_add(&blkg->q_node, &q->blkg_list); + list_add(&blkg->q_node, &disk->queue->blkg_list); for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; @@ -374,19 +349,20 @@ err_free_blkg: /** * blkg_lookup_create - lookup blkg, try to create one if not there * @blkcg: blkcg of interest - * @q: request_queue of interest + * @disk: gendisk of interest * - * Lookup blkg for the @blkcg - @q pair. If it doesn't exist, try to + * Lookup blkg for the @blkcg - @disk pair. If it doesn't exist, try to * create one. blkg creation is performed recursively from blkcg_root such * that all non-root blkg's have access to the parent blkg. This function - * should be called under RCU read lock and takes @q->queue_lock. + * should be called under RCU read lock and takes @disk->queue->queue_lock. * * Returns the blkg or the closest blkg if blkg_create() fails as it walks * down from root. 
*/ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q) + struct gendisk *disk) { + struct request_queue *q = disk->queue; struct blkcg_gq *blkg; unsigned long flags; @@ -397,9 +373,13 @@ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, return blkg; spin_lock_irqsave(&q->queue_lock, flags); - blkg = __blkg_lookup(blkcg, q, true); - if (blkg) + blkg = blkg_lookup(blkcg, q); + if (blkg) { + if (blkcg != &blkcg_root && + blkg != rcu_dereference(blkcg->blkg_hint)) + rcu_assign_pointer(blkcg->blkg_hint, blkg); goto found; + } /* * Create blkgs walking down from blkcg_root to @blkcg, so that all @@ -412,7 +392,7 @@ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, struct blkcg_gq *ret_blkg = q->root_blkg; while (parent) { - blkg = __blkg_lookup(parent, q, false); + blkg = blkg_lookup(parent, q); if (blkg) { /* remember closest blkg */ ret_blkg = blkg; @@ -422,7 +402,7 @@ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, parent = blkcg_parent(parent); } - blkg = blkg_create(pos, q, NULL); + blkg = blkg_create(pos, disk, NULL); if (IS_ERR(blkg)) { blkg = ret_blkg; break; @@ -476,14 +456,9 @@ static void blkg_destroy(struct blkcg_gq *blkg) percpu_ref_kill(&blkg->refcnt); } -/** - * blkg_destroy_all - destroy all blkgs associated with a request_queue - * @q: request_queue of interest - * - * Destroy all blkgs associated with @q. - */ -static void blkg_destroy_all(struct request_queue *q) +static void blkg_destroy_all(struct gendisk *disk) { + struct request_queue *q = disk->queue; struct blkcg_gq *blkg, *n; int count = BLKG_DESTROY_BATCH_SIZE; @@ -616,19 +591,6 @@ u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v) } EXPORT_SYMBOL_GPL(__blkg_prfill_u64); -/* Performs queue bypass and policy enabled checks then looks up blkg. 
*/ -static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg, - const struct blkcg_policy *pol, - struct request_queue *q) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - lockdep_assert_held(&q->queue_lock); - - if (!blkcg_policy_enabled(q, pol)) - return ERR_PTR(-EOPNOTSUPP); - return __blkg_lookup(blkcg, q, true /* update_hint */); -} - /** * blkcg_conf_open_bdev - parse and open bdev for per-blkg config update * @inputp: input string pointer @@ -684,6 +646,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, __acquires(rcu) __acquires(&bdev->bd_queue->queue_lock) { struct block_device *bdev; + struct gendisk *disk; struct request_queue *q; struct blkcg_gq *blkg; int ret; @@ -691,8 +654,8 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, bdev = blkcg_conf_open_bdev(&input); if (IS_ERR(bdev)) return PTR_ERR(bdev); - - q = bdev_get_queue(bdev); + disk = bdev->bd_disk; + q = disk->queue; /* * blkcg_deactivate_policy() requires queue to be frozen, we can grab @@ -705,12 +668,12 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, rcu_read_lock(); spin_lock_irq(&q->queue_lock); - blkg = blkg_lookup_check(blkcg, pol, q); - if (IS_ERR(blkg)) { - ret = PTR_ERR(blkg); + if (!blkcg_policy_enabled(q, pol)) { + ret = -EOPNOTSUPP; goto fail_unlock; } + blkg = blkg_lookup(blkcg, q); if (blkg) goto success; @@ -724,7 +687,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, struct blkcg_gq *new_blkg; parent = blkcg_parent(blkcg); - while (parent && !__blkg_lookup(parent, q, false)) { + while (parent && !blkg_lookup(parent, q)) { pos = parent; parent = blkcg_parent(parent); } @@ -733,7 +696,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, spin_unlock_irq(&q->queue_lock); rcu_read_unlock(); - new_blkg = blkg_alloc(pos, q, GFP_KERNEL); + new_blkg = blkg_alloc(pos, disk, GFP_KERNEL); if (unlikely(!new_blkg)) { ret = -ENOMEM; goto fail_exit_queue; @@ -748,17 +711,17 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, rcu_read_lock(); spin_lock_irq(&q->queue_lock); - blkg = blkg_lookup_check(pos, pol, q); - if (IS_ERR(blkg)) { - ret = PTR_ERR(blkg); + if (!blkcg_policy_enabled(q, pol)) { blkg_free(new_blkg); + ret = -EOPNOTSUPP; goto fail_preloaded; } + blkg = blkg_lookup(pos, q); if (blkg) { blkg_free(new_blkg); } else { - blkg = blkg_create(pos, q, new_blkg); + blkg = blkg_create(pos, disk, new_blkg); if (IS_ERR(blkg)) { ret = PTR_ERR(blkg); goto fail_preloaded; @@ -915,8 +878,7 @@ static void blkcg_fill_root_iostats(void) class_dev_iter_init(&iter, &block_class, NULL, &disk_type); while ((dev = class_dev_iter_next(&iter))) { struct block_device *bdev = dev_to_bdev(dev); - struct blkcg_gq *blkg = - blk_queue_root_blkg(bdev_get_queue(bdev)); + struct blkcg_gq *blkg = bdev->bd_disk->queue->root_blkg; struct blkg_iostat tmp; int cpu; unsigned long flags; @@ -1255,25 +1217,16 @@ static int blkcg_css_online(struct cgroup_subsys_state *css) return 0; } -/** - * blkcg_init_queue - initialize blkcg part of request queue - * @q: request_queue to initialize - * - * Called from blk_alloc_queue(). Responsible for initializing blkcg - * part of new request_queue @q. - * - * RETURNS: - * 0 on success, -errno on failure. 
- */ -int blkcg_init_queue(struct request_queue *q) +int blkcg_init_disk(struct gendisk *disk) { + struct request_queue *q = disk->queue; struct blkcg_gq *new_blkg, *blkg; bool preloaded; int ret; INIT_LIST_HEAD(&q->blkg_list); - new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); + new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL); if (!new_blkg) return -ENOMEM; @@ -1282,7 +1235,7 @@ int blkcg_init_queue(struct request_queue *q) /* Make sure the root blkg exists. */ /* spin_lock_irq can serve as RCU read-side critical section. */ spin_lock_irq(&q->queue_lock); - blkg = blkg_create(&blkcg_root, q, new_blkg); + blkg = blkg_create(&blkcg_root, disk, new_blkg); if (IS_ERR(blkg)) goto err_unlock; q->root_blkg = blkg; @@ -1291,25 +1244,26 @@ int blkcg_init_queue(struct request_queue *q) if (preloaded) radix_tree_preload_end(); - ret = blk_ioprio_init(q); + ret = blk_ioprio_init(disk); if (ret) goto err_destroy_all; - ret = blk_throtl_init(q); + ret = blk_throtl_init(disk); if (ret) - goto err_destroy_all; + goto err_ioprio_exit; - ret = blk_iolatency_init(q); - if (ret) { - blk_throtl_exit(q); - blk_ioprio_exit(q); - goto err_destroy_all; - } + ret = blk_iolatency_init(disk); + if (ret) + goto err_throtl_exit; return 0; +err_throtl_exit: + blk_throtl_exit(disk); +err_ioprio_exit: + blk_ioprio_exit(disk); err_destroy_all: - blkg_destroy_all(q); + blkg_destroy_all(disk); return ret; err_unlock: spin_unlock_irq(&q->queue_lock); @@ -1318,16 +1272,10 @@ err_unlock: return PTR_ERR(blkg); } -/** - * blkcg_exit_queue - exit and release blkcg part of request_queue - * @q: request_queue being released - * - * Called from blk_exit_queue(). Responsible for exiting blkcg part. - */ -void blkcg_exit_queue(struct request_queue *q) +void blkcg_exit_disk(struct gendisk *disk) { - blkg_destroy_all(q); - blk_throtl_exit(q); + blkg_destroy_all(disk); + blk_throtl_exit(disk); } static void blkcg_bind(struct cgroup_subsys_state *root_css) @@ -1836,13 +1784,13 @@ out: /** * blkcg_schedule_throttle - this task needs to check for throttling - * @q: the request queue IO was submitted on + * @gendisk: disk to throttle * @use_memdelay: do we charge this to memory delay for PSI * * This is called by the IO controller when we know there's delay accumulated * for the blkg for this task. We do not pass the blkg because there are places * we call this that may not have that information, the swapping code for - * instance will only have a request_queue at that point. This set's the + * instance will only have a block_device at that point. This set's the * notify_resume for the task to check and see if it requires throttling before * returning to user space. * @@ -1851,8 +1799,10 @@ out: * throttle once. If the task needs to be throttled again it'll need to be * re-set at the next time we see the task. 
*/ -void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) +void blkcg_schedule_throttle(struct gendisk *disk, bool use_memdelay) { + struct request_queue *q = disk->queue; + if (unlikely(current->flags & PF_KTHREAD)) return; @@ -1902,8 +1852,7 @@ static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio, struct blkcg_gq *blkg, *ret_blkg = NULL; rcu_read_lock(); - blkg = blkg_lookup_create(css_to_blkcg(css), - bdev_get_queue(bio->bi_bdev)); + blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_bdev->bd_disk); while (blkg) { if (blkg_tryget(blkg)) { ret_blkg = blkg; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index d2724d1dd7c9..aa2b286bc825 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -178,10 +178,8 @@ struct blkcg_policy { extern struct blkcg blkcg_root; extern bool blkcg_debug_stats; -struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, - struct request_queue *q, bool update_hint); -int blkcg_init_queue(struct request_queue *q); -void blkcg_exit_queue(struct request_queue *q); +int blkcg_init_disk(struct gendisk *disk); +void blkcg_exit_disk(struct gendisk *disk); /* Blkio controller policy registration */ int blkcg_policy_register(struct blkcg_policy *pol); @@ -227,22 +225,21 @@ static inline bool bio_issue_as_root_blkg(struct bio *bio) } /** - * __blkg_lookup - internal version of blkg_lookup() + * blkg_lookup - lookup blkg for the specified blkcg - q pair * @blkcg: blkcg of interest * @q: request_queue of interest - * @update_hint: whether to update lookup hint with the result or not * - * This is internal version and shouldn't be used by policy - * implementations. Looks up blkgs for the @blkcg - @q pair regardless of - * @q's bypass state. If @update_hint is %true, the caller should be - * holding @q->queue_lock and lookup hint is updated on success. + * Lookup blkg for the @blkcg - @q pair. + + * Must be called in a RCU critical section. */ -static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, - struct request_queue *q, - bool update_hint) +static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, + struct request_queue *q) { struct blkcg_gq *blkg; + WARN_ON_ONCE(!rcu_read_lock_held()); + if (blkcg == &blkcg_root) return q->root_blkg; @@ -250,33 +247,10 @@ static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, if (blkg && blkg->q == q) return blkg; - return blkg_lookup_slowpath(blkcg, q, update_hint); -} - -/** - * blkg_lookup - lookup blkg for the specified blkcg - q pair - * @blkcg: blkcg of interest - * @q: request_queue of interest - * - * Lookup blkg for the @blkcg - @q pair. This function should be called - * under RCU read lock. - */ -static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, - struct request_queue *q) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - return __blkg_lookup(blkcg, q, false); -} - -/** - * blk_queue_root_blkg - return blkg for the (blkcg_root, @q) pair - * @q: request_queue of interest - * - * Lookup blkg for @q at the root level. See also blkg_lookup(). 
- */ -static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q) -{ - return q->root_blkg; + blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); + if (blkg && blkg->q != q) + blkg = NULL; + return blkg; } /** @@ -373,8 +347,8 @@ static inline void blkg_put(struct blkcg_gq *blkg) */ #define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ - if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ - (p_blkg)->q, false))) + if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ + (p_blkg)->q))) /** * blkg_for_each_descendant_post - post-order walk of a blkg's descendants @@ -388,8 +362,8 @@ static inline void blkg_put(struct blkcg_gq *blkg) */ #define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ - if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ - (p_blkg)->q, false))) + if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ + (p_blkg)->q))) bool __blkcg_punt_bio_submit(struct bio *bio); @@ -507,10 +481,8 @@ struct blkcg { }; static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } -static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q) -{ return NULL; } -static inline int blkcg_init_queue(struct request_queue *q) { return 0; } -static inline void blkcg_exit_queue(struct request_queue *q) { } +static inline int blkcg_init_disk(struct gendisk *disk) { return 0; } +static inline void blkcg_exit_disk(struct gendisk *disk) { } static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } static inline int blkcg_activate_policy(struct request_queue *q, diff --git a/block/blk-core.c b/block/blk-core.c index 651057c4146b..17667159482e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -37,7 +37,6 @@ #include <linux/t10-pi.h> #include <linux/debugfs.h> #include <linux/bpf.h> -#include <linux/psi.h> #include <linux/part_stat.h> #include <linux/sched/sysctl.h> #include <linux/blk-crypto.h> @@ -487,18 +486,15 @@ static int __init fail_make_request_debugfs(void) late_initcall(fail_make_request_debugfs); #endif /* CONFIG_FAIL_MAKE_REQUEST */ -static inline bool bio_check_ro(struct bio *bio) +static inline void bio_check_ro(struct bio *bio) { if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) { if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) - return false; + return; pr_warn("Trying to write to read-only block-device %pg\n", bio->bi_bdev); /* Older lvm-tools actually trigger this */ - return false; } - - return false; } static noinline int should_fail_bio(struct bio *bio) @@ -717,13 +713,12 @@ void submit_bio_noacct(struct bio *bio) * For a REQ_NOWAIT based request, return -EOPNOTSUPP * if queue does not support NOWAIT. */ - if ((bio->bi_opf & REQ_NOWAIT) && !blk_queue_nowait(q)) + if ((bio->bi_opf & REQ_NOWAIT) && !bdev_nowait(bdev)) goto not_supported; if (should_fail_bio(bio)) goto end_io; - if (unlikely(bio_check_ro(bio))) - goto end_io; + bio_check_ro(bio); if (!bio_flagged(bio, BIO_REMAPPED)) { if (unlikely(bio_check_eod(bio))) goto end_io; @@ -814,7 +809,7 @@ EXPORT_SYMBOL(submit_bio_noacct); * * The success/failure status of the request, along with notification of * completion, is delivered asynchronously through the ->bi_end_io() callback - * in @bio. The bio must NOT be touched by thecaller until ->bi_end_io() has + * in @bio. 
The bio must NOT be touched by the caller until ->bi_end_io() has * been called. */ void submit_bio(struct bio *bio) @@ -829,22 +824,6 @@ void submit_bio(struct bio *bio) count_vm_events(PGPGOUT, bio_sectors(bio)); } - /* - * If we're reading data that is part of the userspace workingset, count - * submission time as memory stall. When the device is congested, or - * the submitting cgroup IO-throttled, submission can be a significant - * part of overall IO time. - */ - if (unlikely(bio_op(bio) == REQ_OP_READ && - bio_flagged(bio, BIO_WORKINGSET))) { - unsigned long pflags; - - psi_memstall_enter(&pflags); - submit_bio_noacct(bio); - psi_memstall_leave(&pflags); - return; - } - submit_bio_noacct(bio); } EXPORT_SYMBOL(submit_bio); @@ -871,6 +850,12 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) return 0; + /* + * As the requests that require a zone lock are not plugged in the + * first place, directly accessing the plug instead of using + * blk_mq_plug() should not have any consequences during flushing for + * zoned devices. + */ blk_flush_plug(current->plug, false); if (bio_queue_enter(bio)) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 7936e5f5821c..495396425bad 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -664,17 +664,13 @@ static struct ioc *q_to_ioc(struct request_queue *q) return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST)); } -static const char *q_name(struct request_queue *q) -{ - if (blk_queue_registered(q)) - return kobject_name(q->kobj.parent); - else - return "<unknown>"; -} - static const char __maybe_unused *ioc_name(struct ioc *ioc) { - return q_name(ioc->rqos.q); + struct gendisk *disk = ioc->rqos.q->disk; + + if (!disk) + return "<unknown>"; + return disk->disk_name; } static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd) @@ -1430,7 +1426,7 @@ static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key) { struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait); - struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key; + struct iocg_wake_ctx *ctx = key; u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse); ctx->vbudget -= cost; @@ -2640,7 +2636,7 @@ retry_lock: if (use_debt) { iocg_incur_debt(iocg, abs_cost, &now); if (iocg_kick_delay(iocg, &now)) - blkcg_schedule_throttle(rqos->q, + blkcg_schedule_throttle(rqos->q->disk, (bio->bi_opf & REQ_SWAP) == REQ_SWAP); iocg_unlock(iocg, ioc_locked, &flags); return; @@ -2741,7 +2737,7 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, if (likely(!list_empty(&iocg->active_list))) { iocg_incur_debt(iocg, abs_cost, &now); if (iocg_kick_delay(iocg, &now)) - blkcg_schedule_throttle(rqos->q, + blkcg_schedule_throttle(rqos->q->disk, (bio->bi_opf & REQ_SWAP) == REQ_SWAP); } else { iocg_commit_bio(iocg, bio, abs_cost, cost); @@ -2832,8 +2828,9 @@ static struct rq_qos_ops ioc_rqos_ops = { .exit = ioc_rqos_exit, }; -static int blk_iocost_init(struct request_queue *q) +static int blk_iocost_init(struct gendisk *disk) { + struct request_queue *q = disk->queue; struct ioc *ioc; struct rq_qos *rqos; int i, cpu, ret; @@ -3170,6 +3167,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, size_t nbytes, loff_t off) { struct block_device *bdev; + struct gendisk *disk; struct ioc *ioc; u32 qos[NR_QOS_PARAMS]; bool enable, user; @@ -3180,12 +3178,13 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, if (IS_ERR(bdev)) return 
PTR_ERR(bdev); - ioc = q_to_ioc(bdev_get_queue(bdev)); + disk = bdev->bd_disk; + ioc = q_to_ioc(disk->queue); if (!ioc) { - ret = blk_iocost_init(bdev_get_queue(bdev)); + ret = blk_iocost_init(disk); if (ret) goto err; - ioc = q_to_ioc(bdev_get_queue(bdev)); + ioc = q_to_ioc(disk->queue); } spin_lock_irq(&ioc->lock); @@ -3262,11 +3261,11 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, spin_lock_irq(&ioc->lock); if (enable) { - blk_stat_enable_accounting(ioc->rqos.q); - blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q); + blk_stat_enable_accounting(disk->queue); + blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); ioc->enabled = true; } else { - blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q); + blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); ioc->enabled = false; } @@ -3349,7 +3348,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, ioc = q_to_ioc(bdev_get_queue(bdev)); if (!ioc) { - ret = blk_iocost_init(bdev_get_queue(bdev)); + ret = blk_iocost_init(bdev->bd_disk); if (ret) goto err; ioc = q_to_ioc(bdev_get_queue(bdev)); diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index e285152345a2..571fa95aafe9 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -292,7 +292,7 @@ static void __blkcg_iolatency_throttle(struct rq_qos *rqos, unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay); if (use_delay) - blkcg_schedule_throttle(rqos->q, use_memdelay); + blkcg_schedule_throttle(rqos->q->disk, use_memdelay); /* * To avoid priority inversions we want to just take a slot if we are @@ -756,8 +756,9 @@ static void blkiolatency_enable_work_fn(struct work_struct *work) } } -int blk_iolatency_init(struct request_queue *q) +int blk_iolatency_init(struct gendisk *disk) { + struct request_queue *q = disk->queue; struct blk_iolatency *blkiolat; struct rq_qos *rqos; int ret; diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c index c00060a02c6e..8bb6b8eba4ce 100644 --- a/block/blk-ioprio.c +++ b/block/blk-ioprio.c @@ -202,14 +202,14 @@ void blkcg_set_ioprio(struct bio *bio) bio->bi_ioprio = prio; } -void blk_ioprio_exit(struct request_queue *q) +void blk_ioprio_exit(struct gendisk *disk) { - blkcg_deactivate_policy(q, &ioprio_policy); + blkcg_deactivate_policy(disk->queue, &ioprio_policy); } -int blk_ioprio_init(struct request_queue *q) +int blk_ioprio_init(struct gendisk *disk) { - return blkcg_activate_policy(q, &ioprio_policy); + return blkcg_activate_policy(disk->queue, &ioprio_policy); } static int __init ioprio_init(void) diff --git a/block/blk-ioprio.h b/block/blk-ioprio.h index 5a1eb550e178..b6afb8e80de0 100644 --- a/block/blk-ioprio.h +++ b/block/blk-ioprio.h @@ -9,15 +9,15 @@ struct request_queue; struct bio; #ifdef CONFIG_BLK_CGROUP_IOPRIO -int blk_ioprio_init(struct request_queue *q); -void blk_ioprio_exit(struct request_queue *q); +int blk_ioprio_init(struct gendisk *disk); +void blk_ioprio_exit(struct gendisk *disk); void blkcg_set_ioprio(struct bio *bio); #else -static inline int blk_ioprio_init(struct request_queue *q) +static inline int blk_ioprio_init(struct gendisk *disk) { return 0; } -static inline void blk_ioprio_exit(struct request_queue *q) +static inline void blk_ioprio_exit(struct gendisk *disk) { } static inline void blkcg_set_ioprio(struct bio *bio) diff --git a/block/blk-map.c b/block/blk-map.c index 7196a6b64c80..7693f8e3c454 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -158,7 +158,7 @@ static int bio_copy_user_iov(struct request *rq, 
struct rq_map_data *map_data, bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, req_op(rq)); if (map_data) { - nr_pages = 1 << map_data->page_order; + nr_pages = 1U << map_data->page_order; i = map_data->offset / PAGE_SIZE; } while (len) { @@ -231,6 +231,16 @@ out_bmd: return ret; } +static void bio_map_put(struct bio *bio) +{ + if (bio->bi_opf & REQ_ALLOC_CACHE) { + bio_put(bio); + } else { + bio_uninit(bio); + kfree(bio); + } +} + static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, gfp_t gfp_mask) { @@ -243,18 +253,34 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, if (!iov_iter_count(iter)) return -EINVAL; - bio = bio_kmalloc(nr_vecs, gfp_mask); - if (!bio) - return -ENOMEM; - bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, req_op(rq)); + if (rq->cmd_flags & REQ_POLLED) { + blk_opf_t opf = rq->cmd_flags | REQ_ALLOC_CACHE; + + bio = bio_alloc_bioset(NULL, nr_vecs, opf, gfp_mask, + &fs_bio_set); + if (!bio) + return -ENOMEM; + } else { + bio = bio_kmalloc(nr_vecs, gfp_mask); + if (!bio) + return -ENOMEM; + bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, req_op(rq)); + } while (iov_iter_count(iter)) { - struct page **pages; + struct page **pages, *stack_pages[UIO_FASTIOV]; ssize_t bytes; - size_t offs, added = 0; + size_t offs; int npages; - bytes = iov_iter_get_pages_alloc2(iter, &pages, LONG_MAX, &offs); + if (nr_vecs <= ARRAY_SIZE(stack_pages)) { + pages = stack_pages; + bytes = iov_iter_get_pages2(iter, pages, LONG_MAX, + nr_vecs, &offs); + } else { + bytes = iov_iter_get_pages_alloc2(iter, &pages, + LONG_MAX, &offs); + } if (unlikely(bytes <= 0)) { ret = bytes ? bytes : -EFAULT; goto out_unmap; @@ -280,7 +306,6 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, break; } - added += n; bytes -= n; offs = 0; } @@ -290,7 +315,8 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, */ while (j < npages) put_page(pages[j++]); - kvfree(pages); + if (pages != stack_pages) + kvfree(pages); /* couldn't stuff something into bio? 
*/ if (bytes) { iov_iter_revert(iter, bytes); @@ -305,8 +331,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, out_unmap: bio_release_pages(bio, false); - bio_uninit(bio); - kfree(bio); + bio_map_put(bio); return ret; } @@ -611,8 +636,7 @@ int blk_rq_unmap_user(struct bio *bio) next_bio = bio; bio = bio->bi_next; - bio_uninit(next_bio); - kfree(next_bio); + bio_map_put(next_bio); } return ret; diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 3db84d3197f1..9c2fce1a7b50 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -32,7 +32,7 @@ static int get_first_sibling(unsigned int cpu) return cpu; } -int blk_mq_map_queues(struct blk_mq_queue_map *qmap) +void blk_mq_map_queues(struct blk_mq_queue_map *qmap) { unsigned int *map = qmap->mq_map; unsigned int nr_queues = qmap->nr_queues; @@ -70,8 +70,6 @@ int blk_mq_map_queues(struct blk_mq_queue_map *qmap) map[cpu] = map[first_sibling]; } } - - return 0; } EXPORT_SYMBOL_GPL(blk_mq_map_queues); diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index dee789f2f98f..bd942341b638 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -807,8 +807,6 @@ static const char *rq_qos_id_to_name(enum rq_qos_id id) return "latency"; case RQ_QOS_COST: return "cost"; - case RQ_QOS_IOPRIO: - return "ioprio"; } return "unknown"; } diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c index b595a94c4d16..a90b88fd1332 100644 --- a/block/blk-mq-pci.c +++ b/block/blk-mq-pci.c @@ -23,8 +23,8 @@ * that maps a queue to the CPUs that have irq affinity for the corresponding * vector. */ -int blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev, - int offset) +void blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev, + int offset) { const struct cpumask *mask; unsigned int queue, cpu; @@ -38,11 +38,10 @@ int blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev, qmap->mq_map[cpu] = qmap->queue_offset + queue; } - return 0; + return; fallback: WARN_ON_ONCE(qmap->nr_queues > 1); blk_mq_clear_mq_map(qmap); - return 0; } EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues); diff --git a/block/blk-mq-rdma.c b/block/blk-mq-rdma.c index 14f968e58b8f..29c1f4d6eb04 100644 --- a/block/blk-mq-rdma.c +++ b/block/blk-mq-rdma.c @@ -21,7 +21,7 @@ * @set->nr_hw_queues, or @dev does not provide an affinity mask for a * vector, we fallback to the naive mapping. */ -int blk_mq_rdma_map_queues(struct blk_mq_queue_map *map, +void blk_mq_rdma_map_queues(struct blk_mq_queue_map *map, struct ib_device *dev, int first_vec) { const struct cpumask *mask; @@ -36,9 +36,9 @@ int blk_mq_rdma_map_queues(struct blk_mq_queue_map *map, map->mq_map[cpu] = map->queue_offset + queue; } - return 0; + return; fallback: - return blk_mq_map_queues(map); + blk_mq_map_queues(map); } EXPORT_SYMBOL_GPL(blk_mq_rdma_map_queues); diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 8e3b36d1cb57..9eb968e14d31 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -196,7 +196,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) * other allocations on previous queue won't be starved. */ if (bt != bt_prev) - sbitmap_queue_wake_up(bt_prev); + sbitmap_queue_wake_up(bt_prev, 1); ws = bt_wait_ptr(bt, data->hctx); } while (1); diff --git a/block/blk-mq-virtio.c b/block/blk-mq-virtio.c index 7b8a42c35102..6589f076a096 100644 --- a/block/blk-mq-virtio.c +++ b/block/blk-mq-virtio.c @@ -21,7 +21,7 @@ * that maps a queue to the CPUs that have irq affinity for the corresponding * vector. 
*/ -int blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap, +void blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap, struct virtio_device *vdev, int first_vec) { const struct cpumask *mask; @@ -39,8 +39,9 @@ int blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap, qmap->mq_map[cpu] = qmap->queue_offset + queue; } - return 0; + return; + fallback: - return blk_mq_map_queues(qmap); + blk_mq_map_queues(qmap); } EXPORT_SYMBOL_GPL(blk_mq_virtio_map_queues); diff --git a/block/blk-mq.c b/block/blk-mq.c index 0df20184cc3e..83492d942348 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1093,10 +1093,12 @@ bool blk_mq_complete_request_remote(struct request *rq) WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); /* - * For a polled request, always complete locally, it's pointless - * to redirect the completion. + * For request which hctx has only one ctx mapping, + * or a polled request, always complete locally, + * it's pointless to redirect the completion. */ - if (rq->cmd_flags & REQ_POLLED) + if (rq->mq_hctx->nr_ctx == 1 || + rq->cmd_flags & REQ_POLLED) return false; if (blk_mq_complete_need_ipi(rq)) { @@ -1213,6 +1215,12 @@ void blk_execute_rq_nowait(struct request *rq, bool at_head) WARN_ON(!blk_rq_is_passthrough(rq)); blk_account_io_start(rq); + + /* + * As plugging can be enabled for passthrough requests on a zoned + * device, directly accessing the plug instead of using blk_mq_plug() + * should not have any consequences. + */ if (current->plug) blk_add_rq_to_plug(current->plug, rq); else @@ -1993,7 +2001,7 @@ out: if (!needs_restart || (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) blk_mq_run_hw_queue(hctx, true); - else if (needs_restart && needs_resource) + else if (needs_resource) blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); blk_mq_update_dispatch_busy(hctx, true); @@ -4192,7 +4200,7 @@ static int blk_mq_alloc_set_map_and_rqs(struct blk_mq_tag_set *set) return 0; } -static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) +static void blk_mq_update_queue_map(struct blk_mq_tag_set *set) { /* * blk_mq_map_queues() and multiple .map_queues() implementations @@ -4222,10 +4230,10 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) for (i = 0; i < set->nr_maps; i++) blk_mq_clear_mq_map(&set->map[i]); - return set->ops->map_queues(set); + set->ops->map_queues(set); } else { BUG_ON(set->nr_maps > 1); - return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); + blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); } } @@ -4324,9 +4332,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues; } - ret = blk_mq_update_queue_map(set); - if (ret) - goto out_free_mq_map; + blk_mq_update_queue_map(set); ret = blk_mq_alloc_set_map_and_rqs(set); if (ret) @@ -4474,14 +4480,14 @@ static bool blk_mq_elv_switch_none(struct list_head *head, list_add(&qe->node, head); /* - * After elevator_switch_mq, the previous elevator_queue will be + * After elevator_switch, the previous elevator_queue will be * released by elevator_release. The reference of the io scheduler * module get by elevator_get will also be put. So we need to get * a reference of the io scheduler module here to prevent it to be * removed. 
*/ __module_get(qe->type->elevator_owner); - elevator_switch_mq(q, NULL); + elevator_switch(q, NULL); mutex_unlock(&q->sysfs_lock); return true; @@ -4513,7 +4519,7 @@ static void blk_mq_elv_switch_back(struct list_head *head, kfree(qe); mutex_lock(&q->sysfs_lock); - elevator_switch_mq(q, t); + elevator_switch(q, t); mutex_unlock(&q->sysfs_lock); } diff --git a/block/blk-mq.h b/block/blk-mq.h index 8ca453ac243d..0b2870839cdd 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -312,7 +312,8 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) static inline struct blk_plug *blk_mq_plug( struct bio *bio) { /* Zoned block device write operation case: do not plug the BIO */ - if (bdev_is_zoned(bio->bi_bdev) && op_is_write(bio_op(bio))) + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && + bdev_op_is_zoned_write(bio->bi_bdev, bio_op(bio))) return NULL; /* diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 08b856570ad1..1ef1f7d4bc3c 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -17,7 +17,6 @@ enum rq_qos_id { RQ_QOS_WBT, RQ_QOS_LATENCY, RQ_QOS_COST, - RQ_QOS_IOPRIO, }; struct rq_wait { diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index e1f009aba6fd..e71b3b43927c 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -844,7 +844,7 @@ int blk_register_queue(struct gendisk *disk) blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); wbt_enable_default(q); - blk_throtl_register_queue(q); + blk_throtl_register(disk); /* Now everything is ready and send out KOBJ_ADD uevent */ kobject_uevent(&q->kobj, KOBJ_ADD); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 9f5fe62afff9..847721dc2b2b 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -329,8 +329,8 @@ static struct bio *throtl_pop_queued(struct list_head *queued, /* init a service_queue, assumes the caller zeroed it */ static void throtl_service_queue_init(struct throtl_service_queue *sq) { - INIT_LIST_HEAD(&sq->queued[0]); - INIT_LIST_HEAD(&sq->queued[1]); + INIT_LIST_HEAD(&sq->queued[READ]); + INIT_LIST_HEAD(&sq->queued[WRITE]); sq->pending_tree = RB_ROOT_CACHED; timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0); } @@ -420,24 +420,17 @@ static void tg_update_has_rules(struct throtl_grp *tg) struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq); struct throtl_data *td = tg->td; int rw; - int has_iops_limit = 0; for (rw = READ; rw <= WRITE; rw++) { - unsigned int iops_limit = tg_iops_limit(tg, rw); - - tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) || + tg->has_rules_iops[rw] = + (parent_tg && parent_tg->has_rules_iops[rw]) || (td->limit_valid[td->limit_index] && - (tg_bps_limit(tg, rw) != U64_MAX || - iops_limit != UINT_MAX)); - - if (iops_limit != UINT_MAX) - has_iops_limit = 1; + tg_iops_limit(tg, rw) != UINT_MAX); + tg->has_rules_bps[rw] = + (parent_tg && parent_tg->has_rules_bps[rw]) || + (td->limit_valid[td->limit_index] && + (tg_bps_limit(tg, rw) != U64_MAX)); } - - if (has_iops_limit) - tg->flags |= THROTL_TG_HAS_IOPS_LIMIT; - else - tg->flags &= ~THROTL_TG_HAS_IOPS_LIMIT; } static void throtl_pd_online(struct blkg_policy_data *pd) @@ -520,7 +513,6 @@ static void throtl_rb_erase(struct rb_node *n, { rb_erase_cached(n, &parent_sq->pending_tree); RB_CLEAR_NODE(n); - --parent_sq->nr_pending; } static void update_min_dispatch_time(struct throtl_service_queue *parent_sq) @@ -572,7 +564,11 @@ static void throtl_enqueue_tg(struct throtl_grp *tg) static void throtl_dequeue_tg(struct throtl_grp *tg) { if (tg->flags & THROTL_TG_PENDING) { - 
throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq); + struct throtl_service_queue *parent_sq = + tg->service_queue.parent_sq; + + throtl_rb_erase(&tg->rb_node, parent_sq); + --parent_sq->nr_pending; tg->flags &= ~THROTL_TG_PENDING; } } @@ -639,6 +635,8 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, { tg->bytes_disp[rw] = 0; tg->io_disp[rw] = 0; + tg->carryover_bytes[rw] = 0; + tg->carryover_ios[rw] = 0; /* * Previous slice has expired. We must have trimmed it after last @@ -656,12 +654,17 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, tg->slice_end[rw], jiffies); } -static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw) +static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw, + bool clear_carryover) { tg->bytes_disp[rw] = 0; tg->io_disp[rw] = 0; tg->slice_start[rw] = jiffies; tg->slice_end[rw] = jiffies + tg->td->throtl_slice; + if (clear_carryover) { + tg->carryover_bytes[rw] = 0; + tg->carryover_ios[rw] = 0; + } throtl_log(&tg->service_queue, "[%c] new slice start=%lu end=%lu jiffies=%lu", @@ -754,33 +757,20 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) tg->slice_start[rw], tg->slice_end[rw], jiffies); } -static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, - u32 iops_limit, unsigned long *wait) +static unsigned int calculate_io_allowed(u32 iops_limit, + unsigned long jiffy_elapsed) { - bool rw = bio_data_dir(bio); unsigned int io_allowed; - unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; u64 tmp; - if (iops_limit == UINT_MAX) { - if (wait) - *wait = 0; - return true; - } - - jiffy_elapsed = jiffies - tg->slice_start[rw]; - - /* Round up to the next throttle slice, wait time must be nonzero */ - jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice); - /* - * jiffy_elapsed_rnd should not be a big value as minimum iops can be + * jiffy_elapsed should not be a big value as minimum iops can be * 1 then at max jiffy elapsed should be equivalent of 1 second as we * will allow dispatch after 1 second and after that slice should * have been trimmed. */ - tmp = (u64)iops_limit * jiffy_elapsed_rnd; + tmp = (u64)iops_limit * jiffy_elapsed; do_div(tmp, HZ); if (tmp > UINT_MAX) @@ -788,6 +778,68 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, else io_allowed = tmp; + return io_allowed; +} + +static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed) +{ + return mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed, (u64)HZ); +} + +static void __tg_update_carryover(struct throtl_grp *tg, bool rw) +{ + unsigned long jiffy_elapsed = jiffies - tg->slice_start[rw]; + u64 bps_limit = tg_bps_limit(tg, rw); + u32 iops_limit = tg_iops_limit(tg, rw); + + /* + * If config is updated while bios are still throttled, calculate and + * accumulate how many bytes/ios are waited across changes. And + * carryover_bytes/ios will be used to calculate new wait time under new + * configuration. 
+ */ + if (bps_limit != U64_MAX) + tg->carryover_bytes[rw] += + calculate_bytes_allowed(bps_limit, jiffy_elapsed) - + tg->bytes_disp[rw]; + if (iops_limit != UINT_MAX) + tg->carryover_ios[rw] += + calculate_io_allowed(iops_limit, jiffy_elapsed) - + tg->io_disp[rw]; +} + +static void tg_update_carryover(struct throtl_grp *tg) +{ + if (tg->service_queue.nr_queued[READ]) + __tg_update_carryover(tg, READ); + if (tg->service_queue.nr_queued[WRITE]) + __tg_update_carryover(tg, WRITE); + + /* see comments in struct throtl_grp for meaning of these fields. */ + throtl_log(&tg->service_queue, "%s: %llu %llu %u %u\n", __func__, + tg->carryover_bytes[READ], tg->carryover_bytes[WRITE], + tg->carryover_ios[READ], tg->carryover_ios[WRITE]); +} + +static bool tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio, + u32 iops_limit, unsigned long *wait) +{ + bool rw = bio_data_dir(bio); + unsigned int io_allowed; + unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; + + if (iops_limit == UINT_MAX) { + if (wait) + *wait = 0; + return true; + } + + jiffy_elapsed = jiffies - tg->slice_start[rw]; + + /* Round up to the next throttle slice, wait time must be nonzero */ + jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice); + io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd) + + tg->carryover_ios[rw]; if (tg->io_disp[rw] + 1 <= io_allowed) { if (wait) *wait = 0; @@ -802,16 +854,16 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, return false; } -static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, - u64 bps_limit, unsigned long *wait) +static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, + u64 bps_limit, unsigned long *wait) { bool rw = bio_data_dir(bio); - u64 bytes_allowed, extra_bytes, tmp; + u64 bytes_allowed, extra_bytes; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; unsigned int bio_size = throtl_bio_data_size(bio); /* no need to throttle if this bio's bytes have been accounted */ - if (bps_limit == U64_MAX || bio_flagged(bio, BIO_THROTTLED)) { + if (bps_limit == U64_MAX || bio_flagged(bio, BIO_BPS_THROTTLED)) { if (wait) *wait = 0; return true; @@ -824,11 +876,8 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, jiffy_elapsed_rnd = tg->td->throtl_slice; jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); - - tmp = bps_limit * jiffy_elapsed_rnd; - do_div(tmp, HZ); - bytes_allowed = tmp; - + bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd) + + tg->carryover_bytes[rw]; if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) { if (wait) *wait = 0; @@ -889,7 +938,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, * slice and it should be extended instead. 
@@ -889,7 +938,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
 	 * slice and it should be extended instead.
 	 */
 	if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw]))
-		throtl_start_new_slice(tg, rw);
+		throtl_start_new_slice(tg, rw, true);
 	else {
 		if (time_before(tg->slice_end[rw],
 				jiffies + tg->td->throtl_slice))
@@ -897,8 +946,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
 					jiffies + tg->td->throtl_slice);
 	}
 
-	if (tg_with_in_bps_limit(tg, bio, bps_limit, &bps_wait) &&
-	    tg_with_in_iops_limit(tg, bio, iops_limit, &iops_wait)) {
+	if (tg_within_bps_limit(tg, bio, bps_limit, &bps_wait) &&
+	    tg_within_iops_limit(tg, bio, iops_limit, &iops_wait)) {
 		if (wait)
 			*wait = 0;
 		return true;
@@ -921,22 +970,13 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 	unsigned int bio_size = throtl_bio_data_size(bio);
 
 	/* Charge the bio to the group */
-	if (!bio_flagged(bio, BIO_THROTTLED)) {
+	if (!bio_flagged(bio, BIO_BPS_THROTTLED)) {
 		tg->bytes_disp[rw] += bio_size;
 		tg->last_bytes_disp[rw] += bio_size;
 	}
 
 	tg->io_disp[rw]++;
 	tg->last_io_disp[rw]++;
-
-	/*
-	 * BIO_THROTTLED is used to prevent the same bio to be throttled
-	 * more than once as a throttled bio will go through blk-throtl the
-	 * second time when it eventually gets issued. Set it when a bio
-	 * is being charged to a tg.
-	 */
-	if (!bio_flagged(bio, BIO_THROTTLED))
-		bio_set_flag(bio, BIO_THROTTLED);
 }
 
 /**
@@ -990,9 +1030,9 @@ static void tg_update_disptime(struct throtl_grp *tg)
 	disptime = jiffies + min_wait;
 
 	/* Update dispatch time */
-	throtl_dequeue_tg(tg);
+	throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq);
 	tg->disptime = disptime;
-	throtl_enqueue_tg(tg);
+	tg_service_queue_add(tg);
 
 	/* see throtl_add_bio_tg() */
 	tg->flags &= ~THROTL_TG_WAS_EMPTY;
@@ -1026,6 +1066,7 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
 	sq->nr_queued[rw]--;
 
 	throtl_charge_bio(tg, bio);
+	bio_set_flag(bio, BIO_BPS_THROTTLED);
 
 	/*
 	 * If our parent is another tg, we just need to transfer @bio to
@@ -1101,13 +1142,13 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
 		if (time_before(jiffies, tg->disptime))
 			break;
 
-		throtl_dequeue_tg(tg);
-
 		nr_disp += throtl_dispatch_tg(tg);
 
 		sq = &tg->service_queue;
-		if (sq->nr_queued[0] || sq->nr_queued[1])
+		if (sq->nr_queued[READ] || sq->nr_queued[WRITE])
 			tg_update_disptime(tg);
+		else
+			throtl_dequeue_tg(tg);
 
 		if (nr_disp >= THROTL_QUANTUM)
 			break;
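The hunks above split the old BIO_THROTTLED semantics: throtl_charge_bio() now charges bytes only while BIO_BPS_THROTTLED is clear, and the flag is set in tg_dispatch_one_bio() when the bio is handed up the tree, so a bio that passes through blk-throtl again is still counted against iops limits. A minimal sketch of that accounting split, using stand-in types rather than the kernel structures:

#include <stdbool.h>
#include <stdio.h>

struct fake_bio { bool bps_throttled; unsigned int size; };
struct fake_tg  { unsigned long long bytes_disp; unsigned long io_disp; };

static void charge(struct fake_tg *tg, struct fake_bio *bio)
{
	if (!bio->bps_throttled)	/* bytes are charged at most once */
		tg->bytes_disp += bio->size;
	tg->io_disp++;			/* ios are charged on every pass */
}

int main(void)
{
	struct fake_tg first = {0}, second = {0};
	struct fake_bio bio = { .bps_throttled = false, .size = 4096 };

	charge(&first, &bio);
	bio.bps_throttled = true;	/* set once the bio has been dispatched */
	charge(&second, &bio);

	printf("first pass: %llu bytes %lu ios, second pass: %llu bytes %lu ios\n",
	       first.bytes_disp, first.io_disp, second.bytes_disp, second.io_disp);
	return 0;
}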
@@ -1321,8 +1362,8 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global)
 	 * that a group's limit are dropped suddenly and we don't want to
 	 * account recently dispatched IO with new low rate.
 	 */
-	throtl_start_new_slice(tg, READ);
-	throtl_start_new_slice(tg, WRITE);
+	throtl_start_new_slice(tg, READ, false);
+	throtl_start_new_slice(tg, WRITE, false);
 
 	if (tg->flags & THROTL_TG_PENDING) {
 		tg_update_disptime(tg);
@@ -1350,6 +1391,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
 		v = U64_MAX;
 
 	tg = blkg_to_tg(ctx.blkg);
+	tg_update_carryover(tg);
 
 	if (is_u64)
 		*(u64 *)((void *)tg + of_cft(of)->private) = v;
@@ -1536,6 +1578,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
 		return ret;
 
 	tg = blkg_to_tg(ctx.blkg);
+	tg_update_carryover(tg);
 
 	v[0] = tg->bps_conf[READ][index];
 	v[1] = tg->bps_conf[WRITE][index];
@@ -1673,6 +1716,41 @@ struct blkcg_policy blkcg_policy_throtl = {
 	.pd_free_fn		= throtl_pd_free,
 };
 
+void blk_throtl_cancel_bios(struct gendisk *disk)
+{
+	struct request_queue *q = disk->queue;
+	struct cgroup_subsys_state *pos_css;
+	struct blkcg_gq *blkg;
+
+	spin_lock_irq(&q->queue_lock);
+	/*
+	 * queue_lock is held, rcu lock is not needed here technically.
+	 * However, rcu lock is still held to emphasize that following
+	 * path need RCU protection and to prevent warning from lockdep.
+	 */
+	rcu_read_lock();
+	blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) {
+		struct throtl_grp *tg = blkg_to_tg(blkg);
+		struct throtl_service_queue *sq = &tg->service_queue;
+
+		/*
+		 * Set the flag to make sure throtl_pending_timer_fn() won't
+		 * stop until all throttled bios are dispatched.
+		 */
+		blkg_to_tg(blkg)->flags |= THROTL_TG_CANCELING;
+		/*
+		 * Update disptime after setting the above flag to make sure
+		 * throtl_select_dispatch() won't exit without dispatching.
+		 */
+		tg_update_disptime(tg);
+
+		throtl_schedule_pending_timer(sq, jiffies + 1);
+	}
+	rcu_read_unlock();
+	spin_unlock_irq(&q->queue_lock);
+}
+
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg)
 {
 	unsigned long rtime = jiffies, wtime = jiffies;
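Both configuration writers above (tg_set_conf() and tg_set_limit()) now call tg_update_carryover() before applying the new values, so limits rewritten through the cgroup files take the already-granted budget into account. For reference, a hedged userspace example of driving that path through the cgroup v2 io.max file; the cgroup path and device numbers are placeholders:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/test/io.max";	/* hypothetical cgroup */
	const char *conf = "8:0 rbps=1048576 riops=200\n";	/* MAJ:MIN limits */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, conf, strlen(conf)) < 0)
		perror("write");
	close(fd);
	return 0;
}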
@@ -1777,39 +1855,6 @@ static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg)
 	return false;
 }
 
-void blk_throtl_cancel_bios(struct request_queue *q)
-{
-	struct cgroup_subsys_state *pos_css;
-	struct blkcg_gq *blkg;
-
-	spin_lock_irq(&q->queue_lock);
-	/*
-	 * queue_lock is held, rcu lock is not needed here technically.
-	 * However, rcu lock is still held to emphasize that following
-	 * path need RCU protection and to prevent warning from lockdep.
-	 */
-	rcu_read_lock();
-	blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) {
-		struct throtl_grp *tg = blkg_to_tg(blkg);
-		struct throtl_service_queue *sq = &tg->service_queue;
-
-		/*
-		 * Set the flag to make sure throtl_pending_timer_fn() won't
-		 * stop until all throttled bios are dispatched.
-		 */
-		blkg_to_tg(blkg)->flags |= THROTL_TG_CANCELING;
-		/*
-		 * Update disptime after setting the above flag to make sure
-		 * throtl_select_dispatch() won't exit without dispatching.
-		 */
-		tg_update_disptime(tg);
-
-		throtl_schedule_pending_timer(sq, jiffies + 1);
-	}
-	rcu_read_unlock();
-	spin_unlock_irq(&q->queue_lock);
-}
-
 static bool throtl_can_upgrade(struct throtl_data *td,
 			       struct throtl_grp *this_tg)
 {
@@ -2005,7 +2050,6 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg)
 	tg->checked_last_finish_time = last_finish_time;
 }
 
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 static void throtl_update_latency_buckets(struct throtl_data *td)
 {
 	struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE];
@@ -2086,6 +2130,28 @@ static void throtl_update_latency_buckets(struct throtl_data *td)
 static inline void throtl_update_latency_buckets(struct throtl_data *td)
 {
 }
+
+static void blk_throtl_update_idletime(struct throtl_grp *tg)
+{
+}
+
+static void throtl_downgrade_check(struct throtl_grp *tg)
+{
+}
+
+static void throtl_upgrade_check(struct throtl_grp *tg)
+{
+}
+
+static bool throtl_can_upgrade(struct throtl_data *td,
+			       struct throtl_grp *this_tg)
+{
+	return false;
+}
+
+static void throtl_upgrade_state(struct throtl_data *td)
+{
+}
 #endif
 
 bool __blk_throtl_bio(struct bio *bio)
@@ -2159,8 +2225,10 @@ again:
 		qn = &tg->qnode_on_parent[rw];
 		sq = sq->parent_sq;
 		tg = sq_to_tg(sq);
-		if (!tg)
+		if (!tg) {
+			bio_set_flag(bio, BIO_BPS_THROTTLED);
 			goto out_unlock;
+		}
 	}
 
 	/* out-of-limit, queue to @tg */
@@ -2189,8 +2257,6 @@ again:
 	}
 
 out_unlock:
-	bio_set_flag(bio, BIO_THROTTLED);
-
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 	if (throttled || !td->track_bio_latency)
 		bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY;
@@ -2286,8 +2352,9 @@ void blk_throtl_bio_endio(struct bio *bio)
 }
 #endif
 
-int blk_throtl_init(struct request_queue *q)
+int blk_throtl_init(struct gendisk *disk)
 {
+	struct request_queue *q = disk->queue;
 	struct throtl_data *td;
 	int ret;
 
@@ -2329,8 +2396,10 @@ int blk_throtl_init(struct request_queue *q)
 	return ret;
 }
 
-void blk_throtl_exit(struct request_queue *q)
+void blk_throtl_exit(struct gendisk *disk)
 {
+	struct request_queue *q = disk->queue;
+
 	BUG_ON(!q->td);
 	del_timer_sync(&q->td->service_queue.pending_timer);
 	throtl_shutdown_wq(q);
@@ -2340,8 +2409,9 @@ void blk_throtl_exit(struct request_queue *q)
 	kfree(q->td);
 }
 
-void blk_throtl_register_queue(struct request_queue *q)
+void blk_throtl_register(struct gendisk *disk)
 {
+	struct request_queue *q = disk->queue;
 	struct throtl_data *td;
 	int i;
 
diff --git a/block/blk-throttle.h b/block/blk-throttle.h
index c1b602996127..ef4b7a4de987 100644
--- a/block/blk-throttle.h
+++ b/block/blk-throttle.h
@@ -55,8 +55,7 @@ struct throtl_service_queue {
 enum tg_state_flags {
 	THROTL_TG_PENDING	= 1 << 0,	/* on parent's pending tree */
 	THROTL_TG_WAS_EMPTY	= 1 << 1,	/* bio_lists[] became non-empty */
-	THROTL_TG_HAS_IOPS_LIMIT = 1 << 2,	/* tg has iops limit */
-	THROTL_TG_CANCELING	= 1 << 3,	/* starts to cancel bio */
+	THROTL_TG_CANCELING	= 1 << 2,	/* starts to cancel bio */
 };
 
 enum {
@@ -99,7 +98,8 @@ struct throtl_grp {
 	unsigned int flags;
 
 	/* are there any throtl rules between this group and td? */
-	bool has_rules[2];
+	bool has_rules_bps[2];
+	bool has_rules_iops[2];
 
 	/* internally used bytes per second rate limits */
 	uint64_t bps[2][LIMIT_CNT];
@@ -121,6 +121,15 @@ struct throtl_grp {
 	uint64_t last_bytes_disp[2];
 	unsigned int last_io_disp[2];
 
+	/*
+	 * The following two fields are updated when new configuration is
+	 * submitted while some bios are still throttled, they record how many
+	 * bytes/ios are waited already in previous configuration, and they will
+	 * be used to calculate wait time under new configuration.
+	 */
+	uint64_t carryover_bytes[2];
+	unsigned int carryover_ios[2];
+
 	unsigned long last_check_time;
 
 	unsigned long latency_target; /* us */
@@ -159,27 +168,37 @@ static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
  * Internal throttling interface
  */
 #ifndef CONFIG_BLK_DEV_THROTTLING
-static inline int blk_throtl_init(struct request_queue *q) { return 0; }
-static inline void blk_throtl_exit(struct request_queue *q) { }
-static inline void blk_throtl_register_queue(struct request_queue *q) { }
+static inline int blk_throtl_init(struct gendisk *disk) { return 0; }
+static inline void blk_throtl_exit(struct gendisk *disk) { }
+static inline void blk_throtl_register(struct gendisk *disk) { }
 static inline bool blk_throtl_bio(struct bio *bio) { return false; }
-static inline void blk_throtl_cancel_bios(struct request_queue *q) { }
+static inline void blk_throtl_cancel_bios(struct gendisk *disk) { }
 #else /* CONFIG_BLK_DEV_THROTTLING */
-int blk_throtl_init(struct request_queue *q);
-void blk_throtl_exit(struct request_queue *q);
-void blk_throtl_register_queue(struct request_queue *q);
+int blk_throtl_init(struct gendisk *disk);
+void blk_throtl_exit(struct gendisk *disk);
+void blk_throtl_register(struct gendisk *disk);
 bool __blk_throtl_bio(struct bio *bio);
-void blk_throtl_cancel_bios(struct request_queue *q);
-static inline bool blk_throtl_bio(struct bio *bio)
+void blk_throtl_cancel_bios(struct gendisk *disk);
+
+static inline bool blk_should_throtl(struct bio *bio)
 {
 	struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg);
+	int rw = bio_data_dir(bio);
 
-	/* no need to throttle bps any more if the bio has been throttled */
-	if (bio_flagged(bio, BIO_THROTTLED) &&
-	    !(tg->flags & THROTL_TG_HAS_IOPS_LIMIT))
-		return false;
+	/* iops limit is always counted */
+	if (tg->has_rules_iops[rw])
+		return true;
+
+	if (tg->has_rules_bps[rw] && !bio_flagged(bio, BIO_BPS_THROTTLED))
+		return true;
+
+	return false;
+}
+
+static inline bool blk_throtl_bio(struct bio *bio)
+{
-	if (!tg->has_rules[bio_data_dir(bio)])
+	if (!blk_should_throtl(bio))
 		return false;
 
 	return __blk_throtl_bio(bio);
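blk_throtl_bio() is now a thin wrapper around blk_should_throtl(), which evaluates the split has_rules_bps/has_rules_iops flags: iops rules always send the bio into __blk_throtl_bio(), bps rules only until the bio has been bps-charged once. The same decision as a small standalone sketch, with stand-in types rather than the kernel's:

#include <stdbool.h>
#include <stdio.h>

struct fake_tg { bool has_rules_bps[2]; bool has_rules_iops[2]; };

static bool should_throtl(const struct fake_tg *tg, int rw, bool bps_throttled)
{
	if (tg->has_rules_iops[rw])		/* iops limit is always counted */
		return true;
	if (tg->has_rules_bps[rw] && !bps_throttled)
		return true;			/* bps only until charged once */
	return false;
}

int main(void)
{
	struct fake_tg tg = { .has_rules_bps = { true, true },
			      .has_rules_iops = { false, false } };

	printf("%d %d\n", should_throtl(&tg, 0, false), should_throtl(&tg, 0, true));
	return 0;
}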
@@ -853,11 +857,6 @@ int wbt_init(struct request_queue *q) blk_stat_add_callback(q, rwb->cb); - rwb->min_lat_nsec = wbt_default_latency_nsec(q); - - wbt_queue_depth_changed(&rwb->rqos); - wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); - return 0; err_free: diff --git a/block/blk-zoned.c b/block/blk-zoned.c index a264621d4905..db829401d8d0 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -63,13 +63,10 @@ bool blk_req_needs_zone_write_lock(struct request *rq) if (!rq->q->disk->seq_zones_wlock) return false; - switch (req_op(rq)) { - case REQ_OP_WRITE_ZEROES: - case REQ_OP_WRITE: + if (bdev_op_is_zoned_write(rq->q->disk->part0, req_op(rq))) return blk_rq_zone_is_seq(rq); - default: - return false; - } + + return false; } EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock); diff --git a/block/blk.h b/block/blk.h index d7142c4d2fef..5350bf363035 100644 --- a/block/blk.h +++ b/block/blk.h @@ -270,8 +270,7 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, void blk_insert_flush(struct request *rq); -int elevator_switch_mq(struct request_queue *q, - struct elevator_type *new_e); +int elevator_switch(struct request_queue *q, struct elevator_type *new_e); void elevator_exit(struct request_queue *q); int elv_register_queue(struct request_queue *q, bool uevent); void elv_unregister_queue(struct request_queue *q); @@ -389,9 +388,9 @@ static inline struct bio *blk_queue_bounce(struct bio *bio, } #ifdef CONFIG_BLK_CGROUP_IOLATENCY -extern int blk_iolatency_init(struct request_queue *q); +int blk_iolatency_init(struct gendisk *disk); #else -static inline int blk_iolatency_init(struct request_queue *q) { return 0; } +static inline int blk_iolatency_init(struct gendisk *disk) { return 0; }; #endif #ifdef CONFIG_BLK_DEV_ZONED diff --git a/block/elevator.c b/block/elevator.c index c319765892bb..bd71f0fc4e4b 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -588,7 +588,7 @@ void elv_unregister(struct elevator_type *e) } EXPORT_SYMBOL_GPL(elv_unregister); -int elevator_switch_mq(struct request_queue *q, +static int elevator_switch_mq(struct request_queue *q, struct elevator_type *new_e) { int ret; @@ -723,7 +723,7 @@ void elevator_init_mq(struct request_queue *q) * need for the new one. this way we have a chance of going back to the old * one, if the new one fails init for some reason. 
diff --git a/block/elevator.c b/block/elevator.c
index c319765892bb..bd71f0fc4e4b 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -588,7 +588,7 @@ void elv_unregister(struct elevator_type *e)
 }
 EXPORT_SYMBOL_GPL(elv_unregister);
 
-int elevator_switch_mq(struct request_queue *q,
+static int elevator_switch_mq(struct request_queue *q,
 			      struct elevator_type *new_e)
 {
 	int ret;
@@ -723,7 +723,7 @@ void elevator_init_mq(struct request_queue *q)
  * need for the new one. this way we have a chance of going back to the old
  * one, if the new one fails init for some reason.
  */
-static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
+int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 {
 	int err;
 
diff --git a/block/genhd.c b/block/genhd.c
index 988ba52fd331..514395361d7c 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -627,7 +627,7 @@ void del_gendisk(struct gendisk *disk)
 
 	blk_mq_freeze_queue_wait(q);
 
-	blk_throtl_cancel_bios(disk->queue);
+	blk_throtl_cancel_bios(disk);
 
 	blk_sync_queue(q);
 	blk_flush_integrity();
@@ -1151,7 +1151,8 @@ static void disk_release(struct device *dev)
 	    !test_bit(GD_ADDED, &disk->state))
 		blk_mq_exit_queue(disk->queue);
 
-	blkcg_exit_queue(disk->queue);
+	blkcg_exit_disk(disk);
+
 	bioset_exit(&disk->bio_split);
 	disk_release_events(disk);
@@ -1364,7 +1365,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
 	if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL))
 		goto out_destroy_part_tbl;
 
-	if (blkcg_init_queue(q))
+	if (blkcg_init_disk(disk))
 		goto out_erase_part0;
 
 	rand_initialize_disk(disk);
diff --git a/block/opal_proto.h b/block/opal_proto.h
index b486b3ec7dc4..7152aa1f1a49 100644
--- a/block/opal_proto.h
+++ b/block/opal_proto.h
@@ -39,7 +39,12 @@ enum opal_response_token {
 #define FIRST_TPER_SESSION_NUM	4096
 
 #define TPER_SYNC_SUPPORTED	0x01
+/* FC_LOCKING features */
+#define LOCKING_SUPPORTED_MASK	0x01
+#define LOCKING_ENABLED_MASK	0x02
+#define LOCKED_MASK		0x04
 #define MBR_ENABLED_MASK	0x10
+#define MBR_DONE_MASK		0x20
 
 #define TINY_ATOM_DATA_MASK	0x3F
 #define TINY_ATOM_SIGNED	0x40
diff --git a/block/sed-opal.c b/block/sed-opal.c
index 9700197000f2..2c5327a0543a 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -74,8 +74,7 @@ struct parsed_resp {
 };
 
 struct opal_dev {
-	bool supported;
-	bool mbr_enabled;
+	u32 flags;
 
 	void *data;
 	sec_send_recv *send_recv;
@@ -280,6 +279,30 @@ static bool check_tper(const void *data)
 	return true;
 }
 
+static bool check_lcksuppt(const void *data)
+{
+	const struct d0_locking_features *lfeat = data;
+	u8 sup_feat = lfeat->supported_features;
+
+	return !!(sup_feat & LOCKING_SUPPORTED_MASK);
+}
+
+static bool check_lckenabled(const void *data)
+{
+	const struct d0_locking_features *lfeat = data;
+	u8 sup_feat = lfeat->supported_features;
+
+	return !!(sup_feat & LOCKING_ENABLED_MASK);
+}
+
+static bool check_locked(const void *data)
+{
+	const struct d0_locking_features *lfeat = data;
+	u8 sup_feat = lfeat->supported_features;
+
+	return !!(sup_feat & LOCKED_MASK);
+}
+
 static bool check_mbrenabled(const void *data)
 {
 	const struct d0_locking_features *lfeat = data;
@@ -288,6 +311,14 @@ static bool check_mbrenabled(const void *data)
 	return !!(sup_feat & MBR_ENABLED_MASK);
 }
 
+static bool check_mbrdone(const void *data)
+{
+	const struct d0_locking_features *lfeat = data;
+	u8 sup_feat = lfeat->supported_features;
+
+	return !!(sup_feat & MBR_DONE_MASK);
+}
+
 static bool check_sum(const void *data)
 {
 	const struct d0_single_user_mode *sum = data;
@@ -435,7 +466,7 @@ static int opal_discovery0_end(struct opal_dev *dev)
 	u32 hlen = be32_to_cpu(hdr->length);
 
 	print_buffer(dev->resp, hlen);
-	dev->mbr_enabled = false;
+	dev->flags &= OPAL_FL_SUPPORTED;
 
 	if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) {
 		pr_debug("Discovery length overflows buffer (%zu+%u)/%u\n",
@@ -461,7 +492,16 @@ static int opal_discovery0_end(struct opal_dev *dev)
 			check_geometry(dev, body);
 			break;
 		case FC_LOCKING:
-			dev->mbr_enabled = check_mbrenabled(body->features);
+			if (check_lcksuppt(body->features))
+				dev->flags |= OPAL_FL_LOCKING_SUPPORTED;
+			if (check_lckenabled(body->features))
+				dev->flags |= OPAL_FL_LOCKING_ENABLED;
+			if (check_locked(body->features))
+				dev->flags |= OPAL_FL_LOCKED;
+			if (check_mbrenabled(body->features))
+				dev->flags |= OPAL_FL_MBR_ENABLED;
+			if (check_mbrdone(body->features))
+				dev->flags |= OPAL_FL_MBR_DONE;
 			break;
 		case FC_ENTERPRISE:
 		case FC_DATASTORE:
@@ -2109,7 +2149,8 @@ static int check_opal_support(struct opal_dev *dev)
 	mutex_lock(&dev->dev_lock);
 	setup_opal_dev(dev);
 	ret = opal_discovery0_step(dev);
-	dev->supported = !ret;
+	if (!ret)
+		dev->flags |= OPAL_FL_SUPPORTED;
 	mutex_unlock(&dev->dev_lock);
 
 	return ret;
@@ -2148,6 +2189,7 @@ struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv)
 
 	INIT_LIST_HEAD(&dev->unlk_lst);
 	mutex_init(&dev->dev_lock);
+	dev->flags = 0;
 	dev->data = data;
 	dev->send_recv = send_recv;
 	if (check_opal_support(dev) != 0) {
@@ -2528,7 +2570,7 @@ bool opal_unlock_from_suspend(struct opal_dev *dev)
 
 	if (!dev)
 		return false;
-	if (!dev->supported)
+	if (!(dev->flags & OPAL_FL_SUPPORTED))
 		return false;
 
 	mutex_lock(&dev->dev_lock);
@@ -2546,7 +2588,7 @@ bool opal_unlock_from_suspend(struct opal_dev *dev)
 			was_failure = true;
 		}
 
-		if (dev->mbr_enabled) {
+		if (dev->flags & OPAL_FL_MBR_ENABLED) {
 			ret = __opal_set_mbr_done(dev, &suspend->unlk.session.opal_key);
 			if (ret)
 				pr_debug("Failed to set MBR Done in S3 resume\n");
@@ -2620,6 +2662,23 @@ static int opal_generic_read_write_table(struct opal_dev *dev,
 
 	return ret;
 }
 
+static int opal_get_status(struct opal_dev *dev, void __user *data)
+{
+	struct opal_status sts = {0};
+
+	/*
+	 * check_opal_support() error is not fatal,
+	 * !dev->supported is a valid condition
+	 */
+	if (!check_opal_support(dev))
+		sts.flags = dev->flags;
+	if (copy_to_user(data, &sts, sizeof(sts))) {
+		pr_debug("Error copying status to userspace\n");
+		return -EFAULT;
+	}
+	return 0;
+}
+
 int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
 {
 	void *p;
@@ -2629,12 +2688,14 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
 		return -EACCES;
 	if (!dev)
 		return -ENOTSUPP;
-	if (!dev->supported)
+	if (!(dev->flags & OPAL_FL_SUPPORTED))
 		return -ENOTSUPP;
 
-	p = memdup_user(arg, _IOC_SIZE(cmd));
-	if (IS_ERR(p))
-		return PTR_ERR(p);
+	if (cmd & IOC_IN) {
+		p = memdup_user(arg, _IOC_SIZE(cmd));
+		if (IS_ERR(p))
+			return PTR_ERR(p);
+	}
 
 	switch (cmd) {
 	case IOC_OPAL_SAVE:
@@ -2685,11 +2746,15 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
 	case IOC_OPAL_GENERIC_TABLE_RW:
 		ret = opal_generic_read_write_table(dev, p);
 		break;
+	case IOC_OPAL_GET_STATUS:
+		ret = opal_get_status(dev, arg);
+		break;
 	default:
 		break;
 	}
 
-	kfree(p);
+	if (cmd & IOC_IN)
+		kfree(p);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(sed_ioctl);
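The new IOC_OPAL_GET_STATUS ioctl added above exposes the opal_dev flags word to userspace through struct opal_status. A hedged example of how a tool might query it, assuming a 6.1+ <linux/sed-opal.h> that exports the ioctl and flag constants, and using an example device path:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/sed-opal.h>

int main(void)
{
	struct opal_status st = { 0 };
	int fd = open("/dev/nvme0n1", O_RDONLY);	/* example SED device */

	if (fd < 0 || ioctl(fd, IOC_OPAL_GET_STATUS, &st) < 0) {
		perror("IOC_OPAL_GET_STATUS");
		return 1;
	}
	printf("opal flags: %#x (supported=%d locked=%d)\n", st.flags,
	       !!(st.flags & OPAL_FL_SUPPORTED), !!(st.flags & OPAL_FL_LOCKED));
	close(fd);
	return 0;
}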