diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-10-07 09:19:14 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-10-07 09:19:14 -0700 |
commit | 513389809e138ae903b6ef43c1d5d2ffaf4dca17 (patch) | |
tree | c71e478fab1568da4706868b14eb67a75c148a8b | |
parent | 0a78a376ef3c2f3d397df48909f00cd75f92137a (diff) | |
parent | 30514bd2dd4e86a3ecfd6a93a3eadf7b9ea164a0 (diff) | |
download | linux-stable-513389809e138ae903b6ef43c1d5d2ffaf4dca17.tar.gz linux-stable-513389809e138ae903b6ef43c1d5d2ffaf4dca17.tar.bz2 linux-stable-513389809e138ae903b6ef43c1d5d2ffaf4dca17.zip |
Merge tag 'for-6.1/block-2022-10-03' of git://git.kernel.dk/linux
Pull block updates from Jens Axboe:
- NVMe pull requests via Christoph:
- handle number of queue changes in the TCP and RDMA drivers
(Daniel Wagner)
- allow changing the number of queues in nvmet (Daniel Wagner)
- also consider host_iface when checking ip options (Daniel
Wagner)
- don't map pages which can't come from HIGHMEM (Fabio M. De
Francesco)
- avoid unnecessary flush bios in nvmet (Guixin Liu)
- shrink and better pack the nvme_iod structure (Keith Busch)
- add comment for unaligned "fake" nqn (Linjun Bao)
- print actual source IP address through sysfs "address" attr
(Martin Belanger)
- various cleanups (Jackie Liu, Wolfram Sang, Genjian Zhang)
- handle effects after freeing the request (Keith Busch)
- copy firmware_rev on each init (Keith Busch)
- restrict management ioctls to admin (Keith Busch)
- ensure subsystem reset is single threaded (Keith Busch)
- report the actual number of tagset maps in nvme-pci (Keith
Busch)
- small fabrics authentication fixups (Christoph Hellwig)
- add common code for tagset allocation and freeing (Christoph
Hellwig)
- stop using the request_queue in nvmet (Christoph Hellwig)
- set min_align_mask before calculating max_hw_sectors (Rishabh
Bhatnagar)
- send a rediscover uevent when a persistent discovery controller
reconnects (Sagi Grimberg)
- misc nvmet-tcp fixes (Varun Prakash, zhenwei pi)
- MD pull request via Song:
- Various raid5 fix and clean up, by Logan Gunthorpe and David
Sloan.
- Raid10 performance optimization, by Yu Kuai.
- sbitmap wakeup hang fixes (Hugh, Keith, Jan, Yu)
- IO scheduler switching quisce fix (Keith)
- s390/dasd block driver updates (Stefan)
- support for recovery for the ublk driver (ZiyangZhang)
- rnbd drivers fixes and updates (Guoqing, Santosh, ye, Christoph)
- blk-mq and null_blk map fixes (Bart)
- various bcache fixes (Coly, Jilin, Jules)
- nbd signal hang fix (Shigeru)
- block writeback throttling fix (Yu)
- optimize the passthrough mapping handling (me)
- prepare block cgroups to being gendisk based (Christoph)
- get rid of an old PSI hack in the block layer, moving it to the
callers instead where it belongs (Christoph)
- blk-throttle fixes and cleanups (Yu)
- misc fixes and cleanups (Liu Shixin, Liu Song, Miaohe, Pankaj,
Ping-Xiang, Wolfram, Saurabh, Li Jinlin, Li Lei, Lin, Li zeming,
Miaohe, Bart, Coly, Gaosheng
* tag 'for-6.1/block-2022-10-03' of git://git.kernel.dk/linux: (162 commits)
sbitmap: fix lockup while swapping
block: add rationale for not using blk_mq_plug() when applicable
block: adapt blk_mq_plug() to not plug for writes that require a zone lock
s390/dasd: use blk_mq_alloc_disk
blk-cgroup: don't update the blkg lookup hint in blkg_conf_prep
nvmet: don't look at the request_queue in nvmet_bdev_set_limits
nvmet: don't look at the request_queue in nvmet_bdev_zone_mgmt_emulate_all
blk-mq: use quiesced elevator switch when reinitializing queues
block: replace blk_queue_nowait with bdev_nowait
nvme: remove nvme_ctrl_init_connect_q
nvme-loop: use the tagset alloc/free helpers
nvme-loop: store the generic nvme_ctrl in set->driver_data
nvme-loop: initialize sqsize later
nvme-fc: use the tagset alloc/free helpers
nvme-fc: store the generic nvme_ctrl in set->driver_data
nvme-fc: keep ctrl->sqsize in sync with opts->queue_size
nvme-rdma: use the tagset alloc/free helpers
nvme-rdma: store the generic nvme_ctrl in set->driver_data
nvme-tcp: use the tagset alloc/free helpers
nvme-tcp: store the generic nvme_ctrl in set->driver_data
...
135 files changed, 3204 insertions, 1644 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index 09de72093af9..700aa009f56a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14582,6 +14582,15 @@ F: drivers/nvme/common/ F: include/linux/nvme* F: include/uapi/linux/nvme_ioctl.h +NVM EXPRESS FABRICS AUTHENTICATION +M: Hannes Reinecke <hare@suse.de> +L: linux-nvme@lists.infradead.org +S: Supported +F: drivers/nvme/host/auth.c +F: drivers/nvme/target/auth.c +F: drivers/nvme/target/fabrics-cmd-auth.c +F: include/linux/nvme-auth.h + NVM EXPRESS FC TRANSPORT DRIVERS M: James Smart <james.smart@broadcom.com> L: linux-nvme@lists.infradead.org diff --git a/arch/s390/include/asm/scsw.h b/arch/s390/include/asm/scsw.h index 7ce584aff5bb..322bdcd4b616 100644 --- a/arch/s390/include/asm/scsw.h +++ b/arch/s390/include/asm/scsw.h @@ -215,6 +215,11 @@ union scsw { #define SNS2_ENV_DATA_PRESENT 0x10 #define SNS2_INPRECISE_END 0x04 +/* + * architectured values for PPRC errors + */ +#define SNS7_INVALID_ON_SEC 0x0e + /** * scsw_is_tm - check for transport mode scsw * @scsw: pointer to scsw diff --git a/arch/s390/include/uapi/asm/dasd.h b/arch/s390/include/uapi/asm/dasd.h index 9ec86fae9980..93d1ccd3304c 100644 --- a/arch/s390/include/uapi/asm/dasd.h +++ b/arch/s390/include/uapi/asm/dasd.h @@ -183,6 +183,18 @@ typedef struct format_data_t { } format_data_t; /* + * struct dasd_copypair_swap_data_t + * represents all data necessary to issue a swap of the copy pair relation + */ +struct dasd_copypair_swap_data_t { + char primary[20]; /* BUSID of primary */ + char secondary[20]; /* BUSID of secondary */ + + /* Reserved for future updates. */ + __u8 reserved[64]; +}; + +/* * values to be used for format_data_t.intensity * 0/8: normal format * 1/9: also write record zero @@ -326,6 +338,8 @@ struct dasd_snid_ioctl_data { #define BIODASDSATTR _IOW(DASD_IOCTL_LETTER,2,attrib_data_t) /* Release Allocated Space */ #define BIODASDRAS _IOW(DASD_IOCTL_LETTER, 3, format_data_t) +/* Swap copy pair relation */ +#define BIODASDCOPYPAIRSWAP _IOW(DASD_IOCTL_LETTER, 4, struct dasd_copypair_swap_data_t) /* Get Sense Path Group ID (SNID) data */ #define BIODASDSNID _IOWR(DASD_IOCTL_LETTER, 1, struct dasd_snid_ioctl_data) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 30b15a9a47c4..144bca006463 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -254,17 +254,12 @@ void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, #else /* CONFIG_BFQ_CGROUP_DEBUG */ -void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, - blk_opf_t opf) { } void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf) { } void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf) { } void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, u64 io_start_time_ns, blk_opf_t opf) { } void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } -void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } -void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } -void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } #endif /* CONFIG_BFQ_CGROUP_DEBUG */ diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index c740b41fe0a4..7ea427817f7f 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1925,7 +1925,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, bfqq->service_from_backlogged = 0; bfq_clear_bfqq_softrt_update(bfqq); - bfq_add_bfqq_busy(bfqd, bfqq); + bfq_add_bfqq_busy(bfqq); /* * Expire in-service queue if preemption may be needed for @@ -2419,7 +2419,7 @@ static void bfq_remove_request(struct request_queue *q, bfqq->next_rq = NULL; if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { - bfq_del_bfqq_busy(bfqd, bfqq, false); + bfq_del_bfqq_busy(bfqq, false); /* * bfqq emptied. In normal operation, when * bfqq is empty, bfqq->entity.service and @@ -3098,7 +3098,7 @@ void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq) */ if (bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) && bfqq != bfqd->in_service_queue) - bfq_del_bfqq_busy(bfqd, bfqq, false); + bfq_del_bfqq_busy(bfqq, false); bfq_reassign_last_bfqq(bfqq, NULL); @@ -3908,7 +3908,7 @@ static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, */ bfqq->budget_timeout = jiffies; - bfq_del_bfqq_busy(bfqd, bfqq, true); + bfq_del_bfqq_busy(bfqq, true); } else { bfq_requeue_bfqq(bfqd, bfqq, true); /* @@ -5255,9 +5255,7 @@ void bfq_put_queue(struct bfq_queue *bfqq) struct hlist_node *n; struct bfq_group *bfqg = bfqq_group(bfqq); - if (bfqq->bfqd) - bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", - bfqq, bfqq->ref); + bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); bfqq->ref--; if (bfqq->ref) @@ -5321,7 +5319,7 @@ void bfq_put_queue(struct bfq_queue *bfqq) hlist_del_init(&item->woken_list_node); } - if (bfqq->bfqd && bfqq->bfqd->last_completed_rq_bfqq == bfqq) + if (bfqq->bfqd->last_completed_rq_bfqq == bfqq) bfqq->bfqd->last_completed_rq_bfqq = NULL; kmem_cache_free(bfq_pool, bfqq); diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index ad8e513d7e87..64ee618064ba 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -993,20 +993,23 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); /* ---------------- cgroups-support interface ---------------- */ void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq); -void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, - blk_opf_t opf); void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf); void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf); void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, u64 io_start_time_ns, blk_opf_t opf); void bfqg_stats_update_dequeue(struct bfq_group *bfqg); -void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); -void bfqg_stats_update_idle_time(struct bfq_group *bfqg); void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg); -void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg); void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_group *bfqg); +#ifdef CONFIG_BFQ_CGROUP_DEBUG +void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, + blk_opf_t opf); +void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); +void bfqg_stats_update_idle_time(struct bfq_group *bfqg); +void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg); +#endif + void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg); void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio); void bfq_end_wr_async(struct bfq_data *bfqd); @@ -1077,9 +1080,8 @@ void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bool expiration); -void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bool expiration); -void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq); +void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration); +void bfq_add_bfqq_busy(struct bfq_queue *bfqq); /* --------------- end of interface of B-WF2Q+ ---------------- */ diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 983413cdefad..8fc3da4c23bb 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -1651,9 +1651,10 @@ void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, * the service tree. As a special case, it can be invoked during an * expiration. */ -void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bool expiration) +void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration) { + struct bfq_data *bfqd = bfqq->bfqd; + bfq_log_bfqq(bfqd, bfqq, "del from busy"); bfq_clear_bfqq_busy(bfqq); @@ -1674,8 +1675,10 @@ void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, /* * Called when an inactive queue receives a new request. */ -void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) +void bfq_add_bfqq_busy(struct bfq_queue *bfqq) { + struct bfq_data *bfqd = bfqq->bfqd; + bfq_log_bfqq(bfqd, bfqq, "add to busy"); bfq_activate_bfqq(bfqd, bfqq); diff --git a/block/bio.c b/block/bio.c index 3d3a2678fea2..7cb7d2ff139b 100644 --- a/block/bio.c +++ b/block/bio.c @@ -760,8 +760,6 @@ EXPORT_SYMBOL(bio_put); static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp) { bio_set_flag(bio, BIO_CLONED); - if (bio_flagged(bio_src, BIO_THROTTLED)) - bio_set_flag(bio, BIO_THROTTLED); bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_iter = bio_src->bi_iter; @@ -1065,9 +1063,6 @@ void __bio_add_page(struct bio *bio, struct page *page, bio->bi_iter.bi_size += len; bio->bi_vcnt++; - - if (!bio_flagged(bio, BIO_WORKINGSET) && unlikely(PageWorkingset(page))) - bio_set_flag(bio, BIO_WORKINGSET); } EXPORT_SYMBOL_GPL(__bio_add_page); @@ -1276,9 +1271,6 @@ out: * fit into the bio, or are requested in @iter, whatever is smaller. If * MM encounters an error pinning the requested pages, it stops. Error * is returned only if 0 pages could be pinned. - * - * It's intended for direct IO, so doesn't do PSI tracking, the caller is - * responsible for setting BIO_WORKINGSET if necessary. */ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { @@ -1294,8 +1286,6 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) ret = __bio_iov_iter_get_pages(bio, iter); } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); - /* don't account direct I/O as memory stall */ - bio_clear_flag(bio, BIO_WORKINGSET); return bio->bi_vcnt ? 0 : ret; } EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); @@ -1754,7 +1744,8 @@ static int __init init_bio(void) cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL, bio_cpu_dead); - if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS)) + if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, + BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE)) panic("bio: can't allocate bios\n"); if (bioset_integrity_create(&fs_bio_set, BIO_POOL_SIZE)) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 869af9d72bcf..6a5c849ee061 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -202,19 +202,19 @@ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) /** * blkg_alloc - allocate a blkg * @blkcg: block cgroup the new blkg is associated with - * @q: request_queue the new blkg is associated with + * @disk: gendisk the new blkg is associated with * @gfp_mask: allocation mask to use * * Allocate a new blkg assocating @blkcg and @q. */ -static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, +static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk, gfp_t gfp_mask) { struct blkcg_gq *blkg; int i, cpu; /* alloc and init base part */ - blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node); + blkg = kzalloc_node(sizeof(*blkg), gfp_mask, disk->queue->node); if (!blkg) return NULL; @@ -225,10 +225,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, if (!blkg->iostat_cpu) goto err_free; - if (!blk_get_queue(q)) + if (!blk_get_queue(disk->queue)) goto err_free; - blkg->q = q; + blkg->q = disk->queue; INIT_LIST_HEAD(&blkg->q_node); spin_lock_init(&blkg->async_bio_lock); bio_list_init(&blkg->async_bios); @@ -243,11 +243,11 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, struct blkcg_policy *pol = blkcg_policy[i]; struct blkg_policy_data *pd; - if (!blkcg_policy_enabled(q, pol)) + if (!blkcg_policy_enabled(disk->queue, pol)) continue; /* alloc per-policy data and attach it to blkg */ - pd = pol->pd_alloc_fn(gfp_mask, q, blkcg); + pd = pol->pd_alloc_fn(gfp_mask, disk->queue, blkcg); if (!pd) goto err_free; @@ -263,45 +263,20 @@ err_free: return NULL; } -struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, - struct request_queue *q, bool update_hint) -{ - struct blkcg_gq *blkg; - - /* - * Hint didn't match. Look up from the radix tree. Note that the - * hint can only be updated under queue_lock as otherwise @blkg - * could have already been removed from blkg_tree. The caller is - * responsible for grabbing queue_lock if @update_hint. - */ - blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); - if (blkg && blkg->q == q) { - if (update_hint) { - lockdep_assert_held(&q->queue_lock); - rcu_assign_pointer(blkcg->blkg_hint, blkg); - } - return blkg; - } - - return NULL; -} -EXPORT_SYMBOL_GPL(blkg_lookup_slowpath); - /* * If @new_blkg is %NULL, this function tries to allocate a new one as * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return. */ -static struct blkcg_gq *blkg_create(struct blkcg *blkcg, - struct request_queue *q, +static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk, struct blkcg_gq *new_blkg) { struct blkcg_gq *blkg; int i, ret; - lockdep_assert_held(&q->queue_lock); + lockdep_assert_held(&disk->queue->queue_lock); /* request_queue is dying, do not create/recreate a blkg */ - if (blk_queue_dying(q)) { + if (blk_queue_dying(disk->queue)) { ret = -ENODEV; goto err_free_blkg; } @@ -314,7 +289,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, /* allocate */ if (!new_blkg) { - new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN); + new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT | __GFP_NOWARN); if (unlikely(!new_blkg)) { ret = -ENOMEM; goto err_put_css; @@ -324,7 +299,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, /* link parent */ if (blkcg_parent(blkcg)) { - blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); + blkg->parent = blkg_lookup(blkcg_parent(blkcg), disk->queue); if (WARN_ON_ONCE(!blkg->parent)) { ret = -ENODEV; goto err_put_css; @@ -342,10 +317,10 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, /* insert */ spin_lock(&blkcg->lock); - ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); + ret = radix_tree_insert(&blkcg->blkg_tree, disk->queue->id, blkg); if (likely(!ret)) { hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); - list_add(&blkg->q_node, &q->blkg_list); + list_add(&blkg->q_node, &disk->queue->blkg_list); for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; @@ -374,19 +349,20 @@ err_free_blkg: /** * blkg_lookup_create - lookup blkg, try to create one if not there * @blkcg: blkcg of interest - * @q: request_queue of interest + * @disk: gendisk of interest * - * Lookup blkg for the @blkcg - @q pair. If it doesn't exist, try to + * Lookup blkg for the @blkcg - @disk pair. If it doesn't exist, try to * create one. blkg creation is performed recursively from blkcg_root such * that all non-root blkg's have access to the parent blkg. This function - * should be called under RCU read lock and takes @q->queue_lock. + * should be called under RCU read lock and takes @disk->queue->queue_lock. * * Returns the blkg or the closest blkg if blkg_create() fails as it walks * down from root. */ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q) + struct gendisk *disk) { + struct request_queue *q = disk->queue; struct blkcg_gq *blkg; unsigned long flags; @@ -397,9 +373,13 @@ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, return blkg; spin_lock_irqsave(&q->queue_lock, flags); - blkg = __blkg_lookup(blkcg, q, true); - if (blkg) + blkg = blkg_lookup(blkcg, q); + if (blkg) { + if (blkcg != &blkcg_root && + blkg != rcu_dereference(blkcg->blkg_hint)) + rcu_assign_pointer(blkcg->blkg_hint, blkg); goto found; + } /* * Create blkgs walking down from blkcg_root to @blkcg, so that all @@ -412,7 +392,7 @@ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, struct blkcg_gq *ret_blkg = q->root_blkg; while (parent) { - blkg = __blkg_lookup(parent, q, false); + blkg = blkg_lookup(parent, q); if (blkg) { /* remember closest blkg */ ret_blkg = blkg; @@ -422,7 +402,7 @@ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, parent = blkcg_parent(parent); } - blkg = blkg_create(pos, q, NULL); + blkg = blkg_create(pos, disk, NULL); if (IS_ERR(blkg)) { blkg = ret_blkg; break; @@ -476,14 +456,9 @@ static void blkg_destroy(struct blkcg_gq *blkg) percpu_ref_kill(&blkg->refcnt); } -/** - * blkg_destroy_all - destroy all blkgs associated with a request_queue - * @q: request_queue of interest - * - * Destroy all blkgs associated with @q. - */ -static void blkg_destroy_all(struct request_queue *q) +static void blkg_destroy_all(struct gendisk *disk) { + struct request_queue *q = disk->queue; struct blkcg_gq *blkg, *n; int count = BLKG_DESTROY_BATCH_SIZE; @@ -616,19 +591,6 @@ u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v) } EXPORT_SYMBOL_GPL(__blkg_prfill_u64); -/* Performs queue bypass and policy enabled checks then looks up blkg. */ -static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg, - const struct blkcg_policy *pol, - struct request_queue *q) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - lockdep_assert_held(&q->queue_lock); - - if (!blkcg_policy_enabled(q, pol)) - return ERR_PTR(-EOPNOTSUPP); - return __blkg_lookup(blkcg, q, true /* update_hint */); -} - /** * blkcg_conf_open_bdev - parse and open bdev for per-blkg config update * @inputp: input string pointer @@ -684,6 +646,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, __acquires(rcu) __acquires(&bdev->bd_queue->queue_lock) { struct block_device *bdev; + struct gendisk *disk; struct request_queue *q; struct blkcg_gq *blkg; int ret; @@ -691,8 +654,8 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, bdev = blkcg_conf_open_bdev(&input); if (IS_ERR(bdev)) return PTR_ERR(bdev); - - q = bdev_get_queue(bdev); + disk = bdev->bd_disk; + q = disk->queue; /* * blkcg_deactivate_policy() requires queue to be frozen, we can grab @@ -705,12 +668,12 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, rcu_read_lock(); spin_lock_irq(&q->queue_lock); - blkg = blkg_lookup_check(blkcg, pol, q); - if (IS_ERR(blkg)) { - ret = PTR_ERR(blkg); + if (!blkcg_policy_enabled(q, pol)) { + ret = -EOPNOTSUPP; goto fail_unlock; } + blkg = blkg_lookup(blkcg, q); if (blkg) goto success; @@ -724,7 +687,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, struct blkcg_gq *new_blkg; parent = blkcg_parent(blkcg); - while (parent && !__blkg_lookup(parent, q, false)) { + while (parent && !blkg_lookup(parent, q)) { pos = parent; parent = blkcg_parent(parent); } @@ -733,7 +696,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, spin_unlock_irq(&q->queue_lock); rcu_read_unlock(); - new_blkg = blkg_alloc(pos, q, GFP_KERNEL); + new_blkg = blkg_alloc(pos, disk, GFP_KERNEL); if (unlikely(!new_blkg)) { ret = -ENOMEM; goto fail_exit_queue; @@ -748,17 +711,17 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, rcu_read_lock(); spin_lock_irq(&q->queue_lock); - blkg = blkg_lookup_check(pos, pol, q); - if (IS_ERR(blkg)) { - ret = PTR_ERR(blkg); + if (!blkcg_policy_enabled(q, pol)) { blkg_free(new_blkg); + ret = -EOPNOTSUPP; goto fail_preloaded; } + blkg = blkg_lookup(pos, q); if (blkg) { blkg_free(new_blkg); } else { - blkg = blkg_create(pos, q, new_blkg); + blkg = blkg_create(pos, disk, new_blkg); if (IS_ERR(blkg)) { ret = PTR_ERR(blkg); goto fail_preloaded; @@ -915,8 +878,7 @@ static void blkcg_fill_root_iostats(void) class_dev_iter_init(&iter, &block_class, NULL, &disk_type); while ((dev = class_dev_iter_next(&iter))) { struct block_device *bdev = dev_to_bdev(dev); - struct blkcg_gq *blkg = - blk_queue_root_blkg(bdev_get_queue(bdev)); + struct blkcg_gq *blkg = bdev->bd_disk->queue->root_blkg; struct blkg_iostat tmp; int cpu; unsigned long flags; @@ -1255,25 +1217,16 @@ static int blkcg_css_online(struct cgroup_subsys_state *css) return 0; } -/** - * blkcg_init_queue - initialize blkcg part of request queue - * @q: request_queue to initialize - * - * Called from blk_alloc_queue(). Responsible for initializing blkcg - * part of new request_queue @q. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int blkcg_init_queue(struct request_queue *q) +int blkcg_init_disk(struct gendisk *disk) { + struct request_queue *q = disk->queue; struct blkcg_gq *new_blkg, *blkg; bool preloaded; int ret; INIT_LIST_HEAD(&q->blkg_list); - new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); + new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL); if (!new_blkg) return -ENOMEM; @@ -1282,7 +1235,7 @@ int blkcg_init_queue(struct request_queue *q) /* Make sure the root blkg exists. */ /* spin_lock_irq can serve as RCU read-side critical section. */ spin_lock_irq(&q->queue_lock); - blkg = blkg_create(&blkcg_root, q, new_blkg); + blkg = blkg_create(&blkcg_root, disk, new_blkg); if (IS_ERR(blkg)) goto err_unlock; q->root_blkg = blkg; @@ -1291,25 +1244,26 @@ int blkcg_init_queue(struct request_queue *q) if (preloaded) radix_tree_preload_end(); - ret = blk_ioprio_init(q); + ret = blk_ioprio_init(disk); if (ret) goto err_destroy_all; - ret = blk_throtl_init(q); + ret = blk_throtl_init(disk); if (ret) - goto err_destroy_all; + goto err_ioprio_exit; - ret = blk_iolatency_init(q); - if (ret) { - blk_throtl_exit(q); - blk_ioprio_exit(q); - goto err_destroy_all; - } + ret = blk_iolatency_init(disk); + if (ret) + goto err_throtl_exit; return 0; +err_throtl_exit: + blk_throtl_exit(disk); +err_ioprio_exit: + blk_ioprio_exit(disk); err_destroy_all: - blkg_destroy_all(q); + blkg_destroy_all(disk); return ret; err_unlock: spin_unlock_irq(&q->queue_lock); @@ -1318,16 +1272,10 @@ err_unlock: return PTR_ERR(blkg); } -/** - * blkcg_exit_queue - exit and release blkcg part of request_queue - * @q: request_queue being released - * - * Called from blk_exit_queue(). Responsible for exiting blkcg part. - */ -void blkcg_exit_queue(struct request_queue *q) +void blkcg_exit_disk(struct gendisk *disk) { - blkg_destroy_all(q); - blk_throtl_exit(q); + blkg_destroy_all(disk); + blk_throtl_exit(disk); } static void blkcg_bind(struct cgroup_subsys_state *root_css) @@ -1836,13 +1784,13 @@ out: /** * blkcg_schedule_throttle - this task needs to check for throttling - * @q: the request queue IO was submitted on + * @gendisk: disk to throttle * @use_memdelay: do we charge this to memory delay for PSI * * This is called by the IO controller when we know there's delay accumulated * for the blkg for this task. We do not pass the blkg because there are places * we call this that may not have that information, the swapping code for - * instance will only have a request_queue at that point. This set's the + * instance will only have a block_device at that point. This set's the * notify_resume for the task to check and see if it requires throttling before * returning to user space. * @@ -1851,8 +1799,10 @@ out: * throttle once. If the task needs to be throttled again it'll need to be * re-set at the next time we see the task. */ -void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) +void blkcg_schedule_throttle(struct gendisk *disk, bool use_memdelay) { + struct request_queue *q = disk->queue; + if (unlikely(current->flags & PF_KTHREAD)) return; @@ -1902,8 +1852,7 @@ static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio, struct blkcg_gq *blkg, *ret_blkg = NULL; rcu_read_lock(); - blkg = blkg_lookup_create(css_to_blkcg(css), - bdev_get_queue(bio->bi_bdev)); + blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_bdev->bd_disk); while (blkg) { if (blkg_tryget(blkg)) { ret_blkg = blkg; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index d2724d1dd7c9..aa2b286bc825 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -178,10 +178,8 @@ struct blkcg_policy { extern struct blkcg blkcg_root; extern bool blkcg_debug_stats; -struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, - struct request_queue *q, bool update_hint); -int blkcg_init_queue(struct request_queue *q); -void blkcg_exit_queue(struct request_queue *q); +int blkcg_init_disk(struct gendisk *disk); +void blkcg_exit_disk(struct gendisk *disk); /* Blkio controller policy registration */ int blkcg_policy_register(struct blkcg_policy *pol); @@ -227,22 +225,21 @@ static inline bool bio_issue_as_root_blkg(struct bio *bio) } /** - * __blkg_lookup - internal version of blkg_lookup() + * blkg_lookup - lookup blkg for the specified blkcg - q pair * @blkcg: blkcg of interest * @q: request_queue of interest - * @update_hint: whether to update lookup hint with the result or not * - * This is internal version and shouldn't be used by policy - * implementations. Looks up blkgs for the @blkcg - @q pair regardless of - * @q's bypass state. If @update_hint is %true, the caller should be - * holding @q->queue_lock and lookup hint is updated on success. + * Lookup blkg for the @blkcg - @q pair. + + * Must be called in a RCU critical section. */ -static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, - struct request_queue *q, - bool update_hint) +static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, + struct request_queue *q) { struct blkcg_gq *blkg; + WARN_ON_ONCE(!rcu_read_lock_held()); + if (blkcg == &blkcg_root) return q->root_blkg; @@ -250,33 +247,10 @@ static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, if (blkg && blkg->q == q) return blkg; - return blkg_lookup_slowpath(blkcg, q, update_hint); -} - -/** - * blkg_lookup - lookup blkg for the specified blkcg - q pair - * @blkcg: blkcg of interest - * @q: request_queue of interest - * - * Lookup blkg for the @blkcg - @q pair. This function should be called - * under RCU read lock. - */ -static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, - struct request_queue *q) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - return __blkg_lookup(blkcg, q, false); -} - -/** - * blk_queue_root_blkg - return blkg for the (blkcg_root, @q) pair - * @q: request_queue of interest - * - * Lookup blkg for @q at the root level. See also blkg_lookup(). - */ -static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q) -{ - return q->root_blkg; + blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); + if (blkg && blkg->q != q) + blkg = NULL; + return blkg; } /** @@ -373,8 +347,8 @@ static inline void blkg_put(struct blkcg_gq *blkg) */ #define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ - if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ - (p_blkg)->q, false))) + if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ + (p_blkg)->q))) /** * blkg_for_each_descendant_post - post-order walk of a blkg's descendants @@ -388,8 +362,8 @@ static inline void blkg_put(struct blkcg_gq *blkg) */ #define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ - if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ - (p_blkg)->q, false))) + if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ + (p_blkg)->q))) bool __blkcg_punt_bio_submit(struct bio *bio); @@ -507,10 +481,8 @@ struct blkcg { }; static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } -static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q) -{ return NULL; } -static inline int blkcg_init_queue(struct request_queue *q) { return 0; } -static inline void blkcg_exit_queue(struct request_queue *q) { } +static inline int blkcg_init_disk(struct gendisk *disk) { return 0; } +static inline void blkcg_exit_disk(struct gendisk *disk) { } static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } static inline int blkcg_activate_policy(struct request_queue *q, diff --git a/block/blk-core.c b/block/blk-core.c index 651057c4146b..17667159482e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -37,7 +37,6 @@ #include <linux/t10-pi.h> #include <linux/debugfs.h> #include <linux/bpf.h> -#include <linux/psi.h> #include <linux/part_stat.h> #include <linux/sched/sysctl.h> #include <linux/blk-crypto.h> @@ -487,18 +486,15 @@ static int __init fail_make_request_debugfs(void) late_initcall(fail_make_request_debugfs); #endif /* CONFIG_FAIL_MAKE_REQUEST */ -static inline bool bio_check_ro(struct bio *bio) +static inline void bio_check_ro(struct bio *bio) { if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) { if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) - return false; + return; pr_warn("Trying to write to read-only block-device %pg\n", bio->bi_bdev); /* Older lvm-tools actually trigger this */ - return false; } - - return false; } static noinline int should_fail_bio(struct bio *bio) @@ -717,13 +713,12 @@ void submit_bio_noacct(struct bio *bio) * For a REQ_NOWAIT based request, return -EOPNOTSUPP * if queue does not support NOWAIT. */ - if ((bio->bi_opf & REQ_NOWAIT) && !blk_queue_nowait(q)) + if ((bio->bi_opf & REQ_NOWAIT) && !bdev_nowait(bdev)) goto not_supported; if (should_fail_bio(bio)) goto end_io; - if (unlikely(bio_check_ro(bio))) - goto end_io; + bio_check_ro(bio); if (!bio_flagged(bio, BIO_REMAPPED)) { if (unlikely(bio_check_eod(bio))) goto end_io; @@ -814,7 +809,7 @@ EXPORT_SYMBOL(submit_bio_noacct); * * The success/failure status of the request, along with notification of * completion, is delivered asynchronously through the ->bi_end_io() callback - * in @bio. The bio must NOT be touched by thecaller until ->bi_end_io() has + * in @bio. The bio must NOT be touched by the caller until ->bi_end_io() has * been called. */ void submit_bio(struct bio *bio) @@ -829,22 +824,6 @@ void submit_bio(struct bio *bio) count_vm_events(PGPGOUT, bio_sectors(bio)); } - /* - * If we're reading data that is part of the userspace workingset, count - * submission time as memory stall. When the device is congested, or - * the submitting cgroup IO-throttled, submission can be a significant - * part of overall IO time. - */ - if (unlikely(bio_op(bio) == REQ_OP_READ && - bio_flagged(bio, BIO_WORKINGSET))) { - unsigned long pflags; - - psi_memstall_enter(&pflags); - submit_bio_noacct(bio); - psi_memstall_leave(&pflags); - return; - } - submit_bio_noacct(bio); } EXPORT_SYMBOL(submit_bio); @@ -871,6 +850,12 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) return 0; + /* + * As the requests that require a zone lock are not plugged in the + * first place, directly accessing the plug instead of using + * blk_mq_plug() should not have any consequences during flushing for + * zoned devices. + */ blk_flush_plug(current->plug, false); if (bio_queue_enter(bio)) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 7936e5f5821c..495396425bad 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -664,17 +664,13 @@ static struct ioc *q_to_ioc(struct request_queue *q) return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST)); } -static const char *q_name(struct request_queue *q) -{ - if (blk_queue_registered(q)) - return kobject_name(q->kobj.parent); - else - return "<unknown>"; -} - static const char __maybe_unused *ioc_name(struct ioc *ioc) { - return q_name(ioc->rqos.q); + struct gendisk *disk = ioc->rqos.q->disk; + + if (!disk) + return "<unknown>"; + return disk->disk_name; } static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd) @@ -1430,7 +1426,7 @@ static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key) { struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait); - struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key; + struct iocg_wake_ctx *ctx = key; u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse); ctx->vbudget -= cost; @@ -2640,7 +2636,7 @@ retry_lock: if (use_debt) { iocg_incur_debt(iocg, abs_cost, &now); if (iocg_kick_delay(iocg, &now)) - blkcg_schedule_throttle(rqos->q, + blkcg_schedule_throttle(rqos->q->disk, (bio->bi_opf & REQ_SWAP) == REQ_SWAP); iocg_unlock(iocg, ioc_locked, &flags); return; @@ -2741,7 +2737,7 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, if (likely(!list_empty(&iocg->active_list))) { iocg_incur_debt(iocg, abs_cost, &now); if (iocg_kick_delay(iocg, &now)) - blkcg_schedule_throttle(rqos->q, + blkcg_schedule_throttle(rqos->q->disk, (bio->bi_opf & REQ_SWAP) == REQ_SWAP); } else { iocg_commit_bio(iocg, bio, abs_cost, cost); @@ -2832,8 +2828,9 @@ static struct rq_qos_ops ioc_rqos_ops = { .exit = ioc_rqos_exit, }; -static int blk_iocost_init(struct request_queue *q) +static int blk_iocost_init(struct gendisk *disk) { + struct request_queue *q = disk->queue; struct ioc *ioc; struct rq_qos *rqos; int i, cpu, ret; @@ -3170,6 +3167,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, size_t nbytes, loff_t off) { struct block_device *bdev; + struct gendisk *disk; struct ioc *ioc; u32 qos[NR_QOS_PARAMS]; bool enable, user; @@ -3180,12 +3178,13 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, if (IS_ERR(bdev)) return PTR_ERR(bdev); - ioc = q_to_ioc(bdev_get_queue(bdev)); + disk = bdev->bd_disk; + ioc = q_to_ioc(disk->queue); if (!ioc) { - ret = blk_iocost_init(bdev_get_queue(bdev)); + ret = blk_iocost_init(disk); if (ret) goto err; - ioc = q_to_ioc(bdev_get_queue(bdev)); + ioc = q_to_ioc(disk->queue); } spin_lock_irq(&ioc->lock); @@ -3262,11 +3261,11 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, spin_lock_irq(&ioc->lock); if (enable) { - blk_stat_enable_accounting(ioc->rqos.q); - blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q); + blk_stat_enable_accounting(disk->queue); + blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); ioc->enabled = true; } else { - blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q); + blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); ioc->enabled = false; } @@ -3349,7 +3348,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, ioc = q_to_ioc(bdev_get_queue(bdev)); if (!ioc) { - ret = blk_iocost_init(bdev_get_queue(bdev)); + ret = blk_iocost_init(bdev->bd_disk); if (ret) goto err; ioc = q_to_ioc(bdev_get_queue(bdev)); diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index e285152345a2..571fa95aafe9 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -292,7 +292,7 @@ static void __blkcg_iolatency_throttle(struct rq_qos *rqos, unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay); if (use_delay) - blkcg_schedule_throttle(rqos->q, use_memdelay); + blkcg_schedule_throttle(rqos->q->disk, use_memdelay); /* * To avoid priority inversions we want to just take a slot if we are @@ -756,8 +756,9 @@ static void blkiolatency_enable_work_fn(struct work_struct *work) } } -int blk_iolatency_init(struct request_queue *q) +int blk_iolatency_init(struct gendisk *disk) { + struct request_queue *q = disk->queue; struct blk_iolatency *blkiolat; struct rq_qos *rqos; int ret; diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c index c00060a02c6e..8bb6b8eba4ce 100644 --- a/block/blk-ioprio.c +++ b/block/blk-ioprio.c @@ -202,14 +202,14 @@ void blkcg_set_ioprio(struct bio *bio) bio->bi_ioprio = prio; } -void blk_ioprio_exit(struct request_queue *q) +void blk_ioprio_exit(struct gendisk *disk) { - blkcg_deactivate_policy(q, &ioprio_policy); + blkcg_deactivate_policy(disk->queue, &ioprio_policy); } -int blk_ioprio_init(struct request_queue *q) +int blk_ioprio_init(struct gendisk *disk) { - return blkcg_activate_policy(q, &ioprio_policy); + return blkcg_activate_policy(disk->queue, &ioprio_policy); } static int __init ioprio_init(void) diff --git a/block/blk-ioprio.h b/block/blk-ioprio.h index 5a1eb550e178..b6afb8e80de0 100644 --- a/block/blk-ioprio.h +++ b/block/blk-ioprio.h @@ -9,15 +9,15 @@ struct request_queue; struct bio; #ifdef CONFIG_BLK_CGROUP_IOPRIO -int blk_ioprio_init(struct request_queue *q); -void blk_ioprio_exit(struct request_queue *q); +int blk_ioprio_init(struct gendisk *disk); +void blk_ioprio_exit(struct gendisk *disk); void blkcg_set_ioprio(struct bio *bio); #else -static inline int blk_ioprio_init(struct request_queue *q) +static inline int blk_ioprio_init(struct gendisk *disk) { return 0; } -static inline void blk_ioprio_exit(struct request_queue *q) +static inline void blk_ioprio_exit(struct gendisk *disk) { } static inline void blkcg_set_ioprio(struct bio *bio) diff --git a/block/blk-map.c b/block/blk-map.c index 7196a6b64c80..7693f8e3c454 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -158,7 +158,7 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data, bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, req_op(rq)); if (map_data) { - nr_pages = 1 << map_data->page_order; + nr_pages = 1U << map_data->page_order; i = map_data->offset / PAGE_SIZE; } while (len) { @@ -231,6 +231,16 @@ out_bmd: return ret; } +static void bio_map_put(struct bio *bio) +{ + if (bio->bi_opf & REQ_ALLOC_CACHE) { + bio_put(bio); + } else { + bio_uninit(bio); + kfree(bio); + } +} + static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, gfp_t gfp_mask) { @@ -243,18 +253,34 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, if (!iov_iter_count(iter)) return -EINVAL; - bio = bio_kmalloc(nr_vecs, gfp_mask); - if (!bio) - return -ENOMEM; - bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, req_op(rq)); + if (rq->cmd_flags & REQ_POLLED) { + blk_opf_t opf = rq->cmd_flags | REQ_ALLOC_CACHE; + + bio = bio_alloc_bioset(NULL, nr_vecs, opf, gfp_mask, + &fs_bio_set); + if (!bio) + return -ENOMEM; + } else { + bio = bio_kmalloc(nr_vecs, gfp_mask); + if (!bio) + return -ENOMEM; + bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, req_op(rq)); + } while (iov_iter_count(iter)) { - struct page **pages; + struct page **pages, *stack_pages[UIO_FASTIOV]; ssize_t bytes; - size_t offs, added = 0; + size_t offs; int npages; - bytes = iov_iter_get_pages_alloc2(iter, &pages, LONG_MAX, &offs); + if (nr_vecs <= ARRAY_SIZE(stack_pages)) { + pages = stack_pages; + bytes = iov_iter_get_pages2(iter, pages, LONG_MAX, + nr_vecs, &offs); + } else { + bytes = iov_iter_get_pages_alloc2(iter, &pages, + LONG_MAX, &offs); + } if (unlikely(bytes <= 0)) { ret = bytes ? bytes : -EFAULT; goto out_unmap; @@ -280,7 +306,6 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, break; } - added += n; bytes -= n; offs = 0; } @@ -290,7 +315,8 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, */ while (j < npages) put_page(pages[j++]); - kvfree(pages); + if (pages != stack_pages) + kvfree(pages); /* couldn't stuff something into bio? */ if (bytes) { iov_iter_revert(iter, bytes); @@ -305,8 +331,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, out_unmap: bio_release_pages(bio, false); - bio_uninit(bio); - kfree(bio); + bio_map_put(bio); return ret; } @@ -611,8 +636,7 @@ int blk_rq_unmap_user(struct bio *bio) next_bio = bio; bio = bio->bi_next; - bio_uninit(next_bio); - kfree(next_bio); + bio_map_put(next_bio); } return ret; diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 3db84d3197f1..9c2fce1a7b50 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -32,7 +32,7 @@ static int get_first_sibling(unsigned int cpu) return cpu; } -int blk_mq_map_queues(struct blk_mq_queue_map *qmap) +void blk_mq_map_queues(struct blk_mq_queue_map *qmap) { unsigned int *map = qmap->mq_map; unsigned int nr_queues = qmap->nr_queues; @@ -70,8 +70,6 @@ int blk_mq_map_queues(struct blk_mq_queue_map *qmap) map[cpu] = map[first_sibling]; } } - - return 0; } EXPORT_SYMBOL_GPL(blk_mq_map_queues); diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index dee789f2f98f..bd942341b638 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -807,8 +807,6 @@ static const char *rq_qos_id_to_name(enum rq_qos_id id) return "latency"; case RQ_QOS_COST: return "cost"; - case RQ_QOS_IOPRIO: - return "ioprio"; } return "unknown"; } diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c index b595a94c4d16..a90b88fd1332 100644 --- a/block/blk-mq-pci.c +++ b/block/blk-mq-pci.c @@ -23,8 +23,8 @@ * that maps a queue to the CPUs that have irq affinity for the corresponding * vector. */ -int blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev, - int offset) +void blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev, + int offset) { const struct cpumask *mask; unsigned int queue, cpu; @@ -38,11 +38,10 @@ int blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev, qmap->mq_map[cpu] = qmap->queue_offset + queue; } - return 0; + return; fallback: WARN_ON_ONCE(qmap->nr_queues > 1); blk_mq_clear_mq_map(qmap); - return 0; } EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues); diff --git a/block/blk-mq-rdma.c b/block/blk-mq-rdma.c index 14f968e58b8f..29c1f4d6eb04 100644 --- a/block/blk-mq-rdma.c +++ b/block/blk-mq-rdma.c @@ -21,7 +21,7 @@ * @set->nr_hw_queues, or @dev does not provide an affinity mask for a * vector, we fallback to the naive mapping. */ -int blk_mq_rdma_map_queues(struct blk_mq_queue_map *map, +void blk_mq_rdma_map_queues(struct blk_mq_queue_map *map, struct ib_device *dev, int first_vec) { const struct cpumask *mask; @@ -36,9 +36,9 @@ int blk_mq_rdma_map_queues(struct blk_mq_queue_map *map, map->mq_map[cpu] = map->queue_offset + queue; } - return 0; + return; fallback: - return blk_mq_map_queues(map); + blk_mq_map_queues(map); } EXPORT_SYMBOL_GPL(blk_mq_rdma_map_queues); diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 8e3b36d1cb57..9eb968e14d31 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -196,7 +196,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) * other allocations on previous queue won't be starved. */ if (bt != bt_prev) - sbitmap_queue_wake_up(bt_prev); + sbitmap_queue_wake_up(bt_prev, 1); ws = bt_wait_ptr(bt, data->hctx); } while (1); diff --git a/block/blk-mq-virtio.c b/block/blk-mq-virtio.c index 7b8a42c35102..6589f076a096 100644 --- a/block/blk-mq-virtio.c +++ b/block/blk-mq-virtio.c @@ -21,7 +21,7 @@ * that maps a queue to the CPUs that have irq affinity for the corresponding * vector. */ -int blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap, +void blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap, struct virtio_device *vdev, int first_vec) { const struct cpumask *mask; @@ -39,8 +39,9 @@ int blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap, qmap->mq_map[cpu] = qmap->queue_offset + queue; } - return 0; + return; + fallback: - return blk_mq_map_queues(qmap); + blk_mq_map_queues(qmap); } EXPORT_SYMBOL_GPL(blk_mq_virtio_map_queues); diff --git a/block/blk-mq.c b/block/blk-mq.c index 0df20184cc3e..83492d942348 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1093,10 +1093,12 @@ bool blk_mq_complete_request_remote(struct request *rq) WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); /* - * For a polled request, always complete locally, it's pointless - * to redirect the completion. + * For request which hctx has only one ctx mapping, + * or a polled request, always complete locally, + * it's pointless to redirect the completion. */ - if (rq->cmd_flags & REQ_POLLED) + if (rq->mq_hctx->nr_ctx == 1 || + rq->cmd_flags & REQ_POLLED) return false; if (blk_mq_complete_need_ipi(rq)) { @@ -1213,6 +1215,12 @@ void blk_execute_rq_nowait(struct request *rq, bool at_head) WARN_ON(!blk_rq_is_passthrough(rq)); blk_account_io_start(rq); + + /* + * As plugging can be enabled for passthrough requests on a zoned + * device, directly accessing the plug instead of using blk_mq_plug() + * should not have any consequences. + */ if (current->plug) blk_add_rq_to_plug(current->plug, rq); else @@ -1993,7 +2001,7 @@ out: if (!needs_restart || (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) blk_mq_run_hw_queue(hctx, true); - else if (needs_restart && needs_resource) + else if (needs_resource) blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); blk_mq_update_dispatch_busy(hctx, true); @@ -4192,7 +4200,7 @@ static int blk_mq_alloc_set_map_and_rqs(struct blk_mq_tag_set *set) return 0; } -static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) +static void blk_mq_update_queue_map(struct blk_mq_tag_set *set) { /* * blk_mq_map_queues() and multiple .map_queues() implementations @@ -4222,10 +4230,10 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) for (i = 0; i < set->nr_maps; i++) blk_mq_clear_mq_map(&set->map[i]); - return set->ops->map_queues(set); + set->ops->map_queues(set); } else { BUG_ON(set->nr_maps > 1); - return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); + blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); } } @@ -4324,9 +4332,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues; } - ret = blk_mq_update_queue_map(set); - if (ret) - goto out_free_mq_map; + blk_mq_update_queue_map(set); ret = blk_mq_alloc_set_map_and_rqs(set); if (ret) @@ -4474,14 +4480,14 @@ static bool blk_mq_elv_switch_none(struct list_head *head, list_add(&qe->node, head); /* - * After elevator_switch_mq, the previous elevator_queue will be + * After elevator_switch, the previous elevator_queue will be * released by elevator_release. The reference of the io scheduler * module get by elevator_get will also be put. So we need to get * a reference of the io scheduler module here to prevent it to be * removed. */ __module_get(qe->type->elevator_owner); - elevator_switch_mq(q, NULL); + elevator_switch(q, NULL); mutex_unlock(&q->sysfs_lock); return true; @@ -4513,7 +4519,7 @@ static void blk_mq_elv_switch_back(struct list_head *head, kfree(qe); mutex_lock(&q->sysfs_lock); - elevator_switch_mq(q, t); + elevator_switch(q, t); mutex_unlock(&q->sysfs_lock); } diff --git a/block/blk-mq.h b/block/blk-mq.h index 8ca453ac243d..0b2870839cdd 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -312,7 +312,8 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) static inline struct blk_plug *blk_mq_plug( struct bio *bio) { /* Zoned block device write operation case: do not plug the BIO */ - if (bdev_is_zoned(bio->bi_bdev) && op_is_write(bio_op(bio))) + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && + bdev_op_is_zoned_write(bio->bi_bdev, bio_op(bio))) return NULL; /* diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 08b856570ad1..1ef1f7d4bc3c 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -17,7 +17,6 @@ enum rq_qos_id { RQ_QOS_WBT, RQ_QOS_LATENCY, RQ_QOS_COST, - RQ_QOS_IOPRIO, }; struct rq_wait { diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index e1f009aba6fd..e71b3b43927c 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -844,7 +844,7 @@ int blk_register_queue(struct gendisk *disk) blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); wbt_enable_default(q); - blk_throtl_register_queue(q); + blk_throtl_register(disk); /* Now everything is ready and send out KOBJ_ADD uevent */ kobject_uevent(&q->kobj, KOBJ_ADD); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 9f5fe62afff9..847721dc2b2b 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -329,8 +329,8 @@ static struct bio *throtl_pop_queued(struct list_head *queued, /* init a service_queue, assumes the caller zeroed it */ static void throtl_service_queue_init(struct throtl_service_queue *sq) { - INIT_LIST_HEAD(&sq->queued[0]); - INIT_LIST_HEAD(&sq->queued[1]); + INIT_LIST_HEAD(&sq->queued[READ]); + INIT_LIST_HEAD(&sq->queued[WRITE]); sq->pending_tree = RB_ROOT_CACHED; timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0); } @@ -420,24 +420,17 @@ static void tg_update_has_rules(struct throtl_grp *tg) struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq); struct throtl_data *td = tg->td; int rw; - int has_iops_limit = 0; for (rw = READ; rw <= WRITE; rw++) { - unsigned int iops_limit = tg_iops_limit(tg, rw); - - tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) || + tg->has_rules_iops[rw] = + (parent_tg && parent_tg->has_rules_iops[rw]) || (td->limit_valid[td->limit_index] && - (tg_bps_limit(tg, rw) != U64_MAX || - iops_limit != UINT_MAX)); - - if (iops_limit != UINT_MAX) - has_iops_limit = 1; + tg_iops_limit(tg, rw) != UINT_MAX); + tg->has_rules_bps[rw] = + (parent_tg && parent_tg->has_rules_bps[rw]) || + (td->limit_valid[td->limit_index] && + (tg_bps_limit(tg, rw) != U64_MAX)); } - - if (has_iops_limit) - tg->flags |= THROTL_TG_HAS_IOPS_LIMIT; - else - tg->flags &= ~THROTL_TG_HAS_IOPS_LIMIT; } static void throtl_pd_online(struct blkg_policy_data *pd) @@ -520,7 +513,6 @@ static void throtl_rb_erase(struct rb_node *n, { rb_erase_cached(n, &parent_sq->pending_tree); RB_CLEAR_NODE(n); - --parent_sq->nr_pending; } static void update_min_dispatch_time(struct throtl_service_queue *parent_sq) @@ -572,7 +564,11 @@ static void throtl_enqueue_tg(struct throtl_grp *tg) static void throtl_dequeue_tg(struct throtl_grp *tg) { if (tg->flags & THROTL_TG_PENDING) { - throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq); + struct throtl_service_queue *parent_sq = + tg->service_queue.parent_sq; + + throtl_rb_erase(&tg->rb_node, parent_sq); + --parent_sq->nr_pending; tg->flags &= ~THROTL_TG_PENDING; } } @@ -639,6 +635,8 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, { tg->bytes_disp[rw] = 0; tg->io_disp[rw] = 0; + tg->carryover_bytes[rw] = 0; + tg->carryover_ios[rw] = 0; /* * Previous slice has expired. We must have trimmed it after last @@ -656,12 +654,17 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, tg->slice_end[rw], jiffies); } -static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw) +static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw, + bool clear_carryover) { tg->bytes_disp[rw] = 0; tg->io_disp[rw] = 0; tg->slice_start[rw] = jiffies; tg->slice_end[rw] = jiffies + tg->td->throtl_slice; + if (clear_carryover) { + tg->carryover_bytes[rw] = 0; + tg->carryover_ios[rw] = 0; + } throtl_log(&tg->service_queue, "[%c] new slice start=%lu end=%lu jiffies=%lu", @@ -754,33 +757,20 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) tg->slice_start[rw], tg->slice_end[rw], jiffies); } -static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, - u32 iops_limit, unsigned long *wait) +static unsigned int calculate_io_allowed(u32 iops_limit, + unsigned long jiffy_elapsed) { - bool rw = bio_data_dir(bio); unsigned int io_allowed; - unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; u64 tmp; - if (iops_limit == UINT_MAX) { - if (wait) - *wait = 0; - return true; - } - - jiffy_elapsed = jiffies - tg->slice_start[rw]; - - /* Round up to the next throttle slice, wait time must be nonzero */ - jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice); - /* - * jiffy_elapsed_rnd should not be a big value as minimum iops can be + * jiffy_elapsed should not be a big value as minimum iops can be * 1 then at max jiffy elapsed should be equivalent of 1 second as we * will allow dispatch after 1 second and after that slice should * have been trimmed. */ - tmp = (u64)iops_limit * jiffy_elapsed_rnd; + tmp = (u64)iops_limit * jiffy_elapsed; do_div(tmp, HZ); if (tmp > UINT_MAX) @@ -788,6 +778,68 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, else io_allowed = tmp; + return io_allowed; +} + +static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed) +{ + return mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed, (u64)HZ); +} + +static void __tg_update_carryover(struct throtl_grp *tg, bool rw) +{ + unsigned long jiffy_elapsed = jiffies - tg->slice_start[rw]; + u64 bps_limit = tg_bps_limit(tg, rw); + u32 iops_limit = tg_iops_limit(tg, rw); + + /* + * If config is updated while bios are still throttled, calculate and + * accumulate how many bytes/ios are waited across changes. And + * carryover_bytes/ios will be used to calculate new wait time under new + * configuration. + */ + if (bps_limit != U64_MAX) + tg->carryover_bytes[rw] += + calculate_bytes_allowed(bps_limit, jiffy_elapsed) - + tg->bytes_disp[rw]; + if (iops_limit != UINT_MAX) + tg->carryover_ios[rw] += + calculate_io_allowed(iops_limit, jiffy_elapsed) - + tg->io_disp[rw]; +} + +static void tg_update_carryover(struct throtl_grp *tg) +{ + if (tg->service_queue.nr_queued[READ]) + __tg_update_carryover(tg, READ); + if (tg->service_queue.nr_queued[WRITE]) + __tg_update_carryover(tg, WRITE); + + /* see comments in struct throtl_grp for meaning of these fields. */ + throtl_log(&tg->service_queue, "%s: %llu %llu %u %u\n", __func__, + tg->carryover_bytes[READ], tg->carryover_bytes[WRITE], + tg->carryover_ios[READ], tg->carryover_ios[WRITE]); +} + +static bool tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio, + u32 iops_limit, unsigned long *wait) +{ + bool rw = bio_data_dir(bio); + unsigned int io_allowed; + unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; + + if (iops_limit == UINT_MAX) { + if (wait) + *wait = 0; + return true; + } + + jiffy_elapsed = jiffies - tg->slice_start[rw]; + + /* Round up to the next throttle slice, wait time must be nonzero */ + jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice); + io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd) + + tg->carryover_ios[rw]; if (tg->io_disp[rw] + 1 <= io_allowed) { if (wait) *wait = 0; @@ -802,16 +854,16 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, return false; } -static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, - u64 bps_limit, unsigned long *wait) +static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, + u64 bps_limit, unsigned long *wait) { bool rw = bio_data_dir(bio); - u64 bytes_allowed, extra_bytes, tmp; + u64 bytes_allowed, extra_bytes; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; unsigned int bio_size = throtl_bio_data_size(bio); /* no need to throttle if this bio's bytes have been accounted */ - if (bps_limit == U64_MAX || bio_flagged(bio, BIO_THROTTLED)) { + if (bps_limit == U64_MAX || bio_flagged(bio, BIO_BPS_THROTTLED)) { if (wait) *wait = 0; return true; @@ -824,11 +876,8 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, jiffy_elapsed_rnd = tg->td->throtl_slice; jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); - - tmp = bps_limit * jiffy_elapsed_rnd; - do_div(tmp, HZ); - bytes_allowed = tmp; - + bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd) + + tg->carryover_bytes[rw]; if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) { if (wait) *wait = 0; @@ -889,7 +938,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, * slice and it should be extended instead. */ if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw])) - throtl_start_new_slice(tg, rw); + throtl_start_new_slice(tg, rw, true); else { if (time_before(tg->slice_end[rw], jiffies + tg->td->throtl_slice)) @@ -897,8 +946,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, jiffies + tg->td->throtl_slice); } - if (tg_with_in_bps_limit(tg, bio, bps_limit, &bps_wait) && - tg_with_in_iops_limit(tg, bio, iops_limit, &iops_wait)) { + if (tg_within_bps_limit(tg, bio, bps_limit, &bps_wait) && + tg_within_iops_limit(tg, bio, iops_limit, &iops_wait)) { if (wait) *wait = 0; return true; @@ -921,22 +970,13 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) unsigned int bio_size = throtl_bio_data_size(bio); /* Charge the bio to the group */ - if (!bio_flagged(bio, BIO_THROTTLED)) { + if (!bio_flagged(bio, BIO_BPS_THROTTLED)) { tg->bytes_disp[rw] += bio_size; tg->last_bytes_disp[rw] += bio_size; } tg->io_disp[rw]++; tg->last_io_disp[rw]++; - - /* - * BIO_THROTTLED is used to prevent the same bio to be throttled - * more than once as a throttled bio will go through blk-throtl the - * second time when it eventually gets issued. Set it when a bio - * is being charged to a tg. - */ - if (!bio_flagged(bio, BIO_THROTTLED)) - bio_set_flag(bio, BIO_THROTTLED); } /** @@ -990,9 +1030,9 @@ static void tg_update_disptime(struct throtl_grp *tg) disptime = jiffies + min_wait; /* Update dispatch time */ - throtl_dequeue_tg(tg); + throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq); tg->disptime = disptime; - throtl_enqueue_tg(tg); + tg_service_queue_add(tg); /* see throtl_add_bio_tg() */ tg->flags &= ~THROTL_TG_WAS_EMPTY; @@ -1026,6 +1066,7 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw) sq->nr_queued[rw]--; throtl_charge_bio(tg, bio); + bio_set_flag(bio, BIO_BPS_THROTTLED); /* * If our parent is another tg, we just need to transfer @bio to @@ -1101,13 +1142,13 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq) if (time_before(jiffies, tg->disptime)) break; - throtl_dequeue_tg(tg); - nr_disp += throtl_dispatch_tg(tg); sq = &tg->service_queue; - if (sq->nr_queued[0] || sq->nr_queued[1]) + if (sq->nr_queued[READ] || sq->nr_queued[WRITE]) tg_update_disptime(tg); + else + throtl_dequeue_tg(tg); if (nr_disp >= THROTL_QUANTUM) break; @@ -1321,8 +1362,8 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global) * that a group's limit are dropped suddenly and we don't want to * account recently dispatched IO with new low rate. */ - throtl_start_new_slice(tg, READ); - throtl_start_new_slice(tg, WRITE); + throtl_start_new_slice(tg, READ, false); + throtl_start_new_slice(tg, WRITE, false); if (tg->flags & THROTL_TG_PENDING) { tg_update_disptime(tg); @@ -1350,6 +1391,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, v = U64_MAX; tg = blkg_to_tg(ctx.blkg); + tg_update_carryover(tg); if (is_u64) *(u64 *)((void *)tg + of_cft(of)->private) = v; @@ -1536,6 +1578,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, return ret; tg = blkg_to_tg(ctx.blkg); + tg_update_carryover(tg); v[0] = tg->bps_conf[READ][index]; v[1] = tg->bps_conf[WRITE][index]; @@ -1673,6 +1716,41 @@ struct blkcg_policy blkcg_policy_throtl = { .pd_free_fn = throtl_pd_free, }; +void blk_throtl_cancel_bios(struct gendisk *disk) +{ + struct request_queue *q = disk->queue; + struct cgroup_subsys_state *pos_css; + struct blkcg_gq *blkg; + + spin_lock_irq(&q->queue_lock); + /* + * queue_lock is held, rcu lock is not needed here technically. + * However, rcu lock is still held to emphasize that following + * path need RCU protection and to prevent warning from lockdep. + */ + rcu_read_lock(); + blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) { + struct throtl_grp *tg = blkg_to_tg(blkg); + struct throtl_service_queue *sq = &tg->service_queue; + + /* + * Set the flag to make sure throtl_pending_timer_fn() won't + * stop until all throttled bios are dispatched. + */ + blkg_to_tg(blkg)->flags |= THROTL_TG_CANCELING; + /* + * Update disptime after setting the above flag to make sure + * throtl_select_dispatch() won't exit without dispatching. + */ + tg_update_disptime(tg); + + throtl_schedule_pending_timer(sq, jiffies + 1); + } + rcu_read_unlock(); + spin_unlock_irq(&q->queue_lock); +} + +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg) { unsigned long rtime = jiffies, wtime = jiffies; @@ -1777,39 +1855,6 @@ static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg) return false; } -void blk_throtl_cancel_bios(struct request_queue *q) -{ - struct cgroup_subsys_state *pos_css; - struct blkcg_gq *blkg; - - spin_lock_irq(&q->queue_lock); - /* - * queue_lock is held, rcu lock is not needed here technically. - * However, rcu lock is still held to emphasize that following - * path need RCU protection and to prevent warning from lockdep. - */ - rcu_read_lock(); - blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) { - struct throtl_grp *tg = blkg_to_tg(blkg); - struct throtl_service_queue *sq = &tg->service_queue; - - /* - * Set the flag to make sure throtl_pending_timer_fn() won't - * stop until all throttled bios are dispatched. - */ - blkg_to_tg(blkg)->flags |= THROTL_TG_CANCELING; - /* - * Update disptime after setting the above flag to make sure - * throtl_select_dispatch() won't exit without dispatching. - */ - tg_update_disptime(tg); - - throtl_schedule_pending_timer(sq, jiffies + 1); - } - rcu_read_unlock(); - spin_unlock_irq(&q->queue_lock); -} - static bool throtl_can_upgrade(struct throtl_data *td, struct throtl_grp *this_tg) { @@ -2005,7 +2050,6 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg) tg->checked_last_finish_time = last_finish_time; } -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW static void throtl_update_latency_buckets(struct throtl_data *td) { struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE]; @@ -2086,6 +2130,28 @@ static void throtl_update_latency_buckets(struct throtl_data *td) static inline void throtl_update_latency_buckets(struct throtl_data *td) { } + +static void blk_throtl_update_idletime(struct throtl_grp *tg) +{ +} + +static void throtl_downgrade_check(struct throtl_grp *tg) +{ +} + +static void throtl_upgrade_check(struct throtl_grp *tg) +{ +} + +static bool throtl_can_upgrade(struct throtl_data *td, + struct throtl_grp *this_tg) +{ + return false; +} + +static void throtl_upgrade_state(struct throtl_data *td) +{ +} #endif bool __blk_throtl_bio(struct bio *bio) @@ -2159,8 +2225,10 @@ again: qn = &tg->qnode_on_parent[rw]; sq = sq->parent_sq; tg = sq_to_tg(sq); - if (!tg) + if (!tg) { + bio_set_flag(bio, BIO_BPS_THROTTLED); goto out_unlock; + } } /* out-of-limit, queue to @tg */ @@ -2189,8 +2257,6 @@ again: } out_unlock: - bio_set_flag(bio, BIO_THROTTLED); - #ifdef CONFIG_BLK_DEV_THROTTLING_LOW if (throttled || !td->track_bio_latency) bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY; @@ -2286,8 +2352,9 @@ void blk_throtl_bio_endio(struct bio *bio) } #endif -int blk_throtl_init(struct request_queue *q) +int blk_throtl_init(struct gendisk *disk) { + struct request_queue *q = disk->queue; struct throtl_data *td; int ret; @@ -2329,8 +2396,10 @@ int blk_throtl_init(struct request_queue *q) return ret; } -void blk_throtl_exit(struct request_queue *q) +void blk_throtl_exit(struct gendisk *disk) { + struct request_queue *q = disk->queue; + BUG_ON(!q->td); del_timer_sync(&q->td->service_queue.pending_timer); throtl_shutdown_wq(q); @@ -2340,8 +2409,9 @@ void blk_throtl_exit(struct request_queue *q) kfree(q->td); } -void blk_throtl_register_queue(struct request_queue *q) +void blk_throtl_register(struct gendisk *disk) { + struct request_queue *q = disk->queue; struct throtl_data *td; int i; diff --git a/block/blk-throttle.h b/block/blk-throttle.h index c1b602996127..ef4b7a4de987 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -55,8 +55,7 @@ struct throtl_service_queue { enum tg_state_flags { THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */ - THROTL_TG_HAS_IOPS_LIMIT = 1 << 2, /* tg has iops limit */ - THROTL_TG_CANCELING = 1 << 3, /* starts to cancel bio */ + THROTL_TG_CANCELING = 1 << 2, /* starts to cancel bio */ }; enum { @@ -99,7 +98,8 @@ struct throtl_grp { unsigned int flags; /* are there any throtl rules between this group and td? */ - bool has_rules[2]; + bool has_rules_bps[2]; + bool has_rules_iops[2]; /* internally used bytes per second rate limits */ uint64_t bps[2][LIMIT_CNT]; @@ -121,6 +121,15 @@ struct throtl_grp { uint64_t last_bytes_disp[2]; unsigned int last_io_disp[2]; + /* + * The following two fields are updated when new configuration is + * submitted while some bios are still throttled, they record how many + * bytes/ios are waited already in previous configuration, and they will + * be used to calculate wait time under new configuration. + */ + uint64_t carryover_bytes[2]; + unsigned int carryover_ios[2]; + unsigned long last_check_time; unsigned long latency_target; /* us */ @@ -159,27 +168,37 @@ static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg) * Internal throttling interface */ #ifndef CONFIG_BLK_DEV_THROTTLING -static inline int blk_throtl_init(struct request_queue *q) { return 0; } -static inline void blk_throtl_exit(struct request_queue *q) { } -static inline void blk_throtl_register_queue(struct request_queue *q) { } +static inline int blk_throtl_init(struct gendisk *disk) { return 0; } +static inline void blk_throtl_exit(struct gendisk *disk) { } +static inline void blk_throtl_register(struct gendisk *disk) { } static inline bool blk_throtl_bio(struct bio *bio) { return false; } -static inline void blk_throtl_cancel_bios(struct request_queue *q) { } +static inline void blk_throtl_cancel_bios(struct gendisk *disk) { } #else /* CONFIG_BLK_DEV_THROTTLING */ -int blk_throtl_init(struct request_queue *q); -void blk_throtl_exit(struct request_queue *q); -void blk_throtl_register_queue(struct request_queue *q); +int blk_throtl_init(struct gendisk *disk); +void blk_throtl_exit(struct gendisk *disk); +void blk_throtl_register(struct gendisk *disk); bool __blk_throtl_bio(struct bio *bio); -void blk_throtl_cancel_bios(struct request_queue *q); -static inline bool blk_throtl_bio(struct bio *bio) +void blk_throtl_cancel_bios(struct gendisk *disk); + +static inline bool blk_should_throtl(struct bio *bio) { struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg); + int rw = bio_data_dir(bio); - /* no need to throttle bps any more if the bio has been throttled */ - if (bio_flagged(bio, BIO_THROTTLED) && - !(tg->flags & THROTL_TG_HAS_IOPS_LIMIT)) - return false; + /* iops limit is always counted */ + if (tg->has_rules_iops[rw]) + return true; + + if (tg->has_rules_bps[rw] && !bio_flagged(bio, BIO_BPS_THROTTLED)) + return true; + + return false; +} + +static inline bool blk_throtl_bio(struct bio *bio) +{ - if (!tg->has_rules[bio_data_dir(bio)]) + if (!blk_should_throtl(bio)) return false; return __blk_throtl_bio(bio); diff --git a/block/blk-wbt.c b/block/blk-wbt.c index a9982000b667..246467926253 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -843,6 +843,10 @@ int wbt_init(struct request_queue *q) rwb->enable_state = WBT_STATE_ON_DEFAULT; rwb->wc = 1; rwb->rq_depth.default_depth = RWB_DEF_DEPTH; + rwb->min_lat_nsec = wbt_default_latency_nsec(q); + + wbt_queue_depth_changed(&rwb->rqos); + wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); /* * Assign rwb and add the stats callback. @@ -853,11 +857,6 @@ int wbt_init(struct request_queue *q) blk_stat_add_callback(q, rwb->cb); - rwb->min_lat_nsec = wbt_default_latency_nsec(q); - - wbt_queue_depth_changed(&rwb->rqos); - wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); - return 0; err_free: diff --git a/block/blk-zoned.c b/block/blk-zoned.c index a264621d4905..db829401d8d0 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -63,13 +63,10 @@ bool blk_req_needs_zone_write_lock(struct request *rq) if (!rq->q->disk->seq_zones_wlock) return false; - switch (req_op(rq)) { - case REQ_OP_WRITE_ZEROES: - case REQ_OP_WRITE: + if (bdev_op_is_zoned_write(rq->q->disk->part0, req_op(rq))) return blk_rq_zone_is_seq(rq); - default: - return false; - } + + return false; } EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock); diff --git a/block/blk.h b/block/blk.h index d7142c4d2fef..5350bf363035 100644 --- a/block/blk.h +++ b/block/blk.h @@ -270,8 +270,7 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, void blk_insert_flush(struct request *rq); -int elevator_switch_mq(struct request_queue *q, - struct elevator_type *new_e); +int elevator_switch(struct request_queue *q, struct elevator_type *new_e); void elevator_exit(struct request_queue *q); int elv_register_queue(struct request_queue *q, bool uevent); void elv_unregister_queue(struct request_queue *q); @@ -389,9 +388,9 @@ static inline struct bio *blk_queue_bounce(struct bio *bio, } #ifdef CONFIG_BLK_CGROUP_IOLATENCY -extern int blk_iolatency_init(struct request_queue *q); +int blk_iolatency_init(struct gendisk *disk); #else -static inline int blk_iolatency_init(struct request_queue *q) { return 0; } +static inline int blk_iolatency_init(struct gendisk *disk) { return 0; }; #endif #ifdef CONFIG_BLK_DEV_ZONED diff --git a/block/elevator.c b/block/elevator.c index c319765892bb..bd71f0fc4e4b 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -588,7 +588,7 @@ void elv_unregister(struct elevator_type *e) } EXPORT_SYMBOL_GPL(elv_unregister); -int elevator_switch_mq(struct request_queue *q, +static int elevator_switch_mq(struct request_queue *q, struct elevator_type *new_e) { int ret; @@ -723,7 +723,7 @@ void elevator_init_mq(struct request_queue *q) * need for the new one. this way we have a chance of going back to the old * one, if the new one fails init for some reason. */ -static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) +int elevator_switch(struct request_queue *q, struct elevator_type *new_e) { int err; diff --git a/block/genhd.c b/block/genhd.c index 988ba52fd331..514395361d7c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -627,7 +627,7 @@ void del_gendisk(struct gendisk *disk) blk_mq_freeze_queue_wait(q); - blk_throtl_cancel_bios(disk->queue); + blk_throtl_cancel_bios(disk); blk_sync_queue(q); blk_flush_integrity(); @@ -1151,7 +1151,8 @@ static void disk_release(struct device *dev) !test_bit(GD_ADDED, &disk->state)) blk_mq_exit_queue(disk->queue); - blkcg_exit_queue(disk->queue); + blkcg_exit_disk(disk); + bioset_exit(&disk->bio_split); disk_release_events(disk); @@ -1364,7 +1365,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL)) goto out_destroy_part_tbl; - if (blkcg_init_queue(q)) + if (blkcg_init_disk(disk)) goto out_erase_part0; rand_initialize_disk(disk); diff --git a/block/opal_proto.h b/block/opal_proto.h index b486b3ec7dc4..7152aa1f1a49 100644 --- a/block/opal_proto.h +++ b/block/opal_proto.h @@ -39,7 +39,12 @@ enum opal_response_token { #define FIRST_TPER_SESSION_NUM 4096 #define TPER_SYNC_SUPPORTED 0x01 +/* FC_LOCKING features */ +#define LOCKING_SUPPORTED_MASK 0x01 +#define LOCKING_ENABLED_MASK 0x02 +#define LOCKED_MASK 0x04 #define MBR_ENABLED_MASK 0x10 +#define MBR_DONE_MASK 0x20 #define TINY_ATOM_DATA_MASK 0x3F #define TINY_ATOM_SIGNED 0x40 diff --git a/block/sed-opal.c b/block/sed-opal.c index 9700197000f2..2c5327a0543a 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -74,8 +74,7 @@ struct parsed_resp { }; struct opal_dev { - bool supported; - bool mbr_enabled; + u32 flags; void *data; sec_send_recv *send_recv; @@ -280,6 +279,30 @@ static bool check_tper(const void *data) return true; } +static bool check_lcksuppt(const void *data) +{ + const struct d0_locking_features *lfeat = data; + u8 sup_feat = lfeat->supported_features; + + return !!(sup_feat & LOCKING_SUPPORTED_MASK); +} + +static bool check_lckenabled(const void *data) +{ + const struct d0_locking_features *lfeat = data; + u8 sup_feat = lfeat->supported_features; + + return !!(sup_feat & LOCKING_ENABLED_MASK); +} + +static bool check_locked(const void *data) +{ + const struct d0_locking_features *lfeat = data; + u8 sup_feat = lfeat->supported_features; + + return !!(sup_feat & LOCKED_MASK); +} + static bool check_mbrenabled(const void *data) { const struct d0_locking_features *lfeat = data; @@ -288,6 +311,14 @@ static bool check_mbrenabled(const void *data) return !!(sup_feat & MBR_ENABLED_MASK); } +static bool check_mbrdone(const void *data) +{ + const struct d0_locking_features *lfeat = data; + u8 sup_feat = lfeat->supported_features; + + return !!(sup_feat & MBR_DONE_MASK); +} + static bool check_sum(const void *data) { const struct d0_single_user_mode *sum = data; @@ -435,7 +466,7 @@ static int opal_discovery0_end(struct opal_dev *dev) u32 hlen = be32_to_cpu(hdr->length); print_buffer(dev->resp, hlen); - dev->mbr_enabled = false; + dev->flags &= OPAL_FL_SUPPORTED; if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) { pr_debug("Discovery length overflows buffer (%zu+%u)/%u\n", @@ -461,7 +492,16 @@ static int opal_discovery0_end(struct opal_dev *dev) check_geometry(dev, body); break; case FC_LOCKING: - dev->mbr_enabled = check_mbrenabled(body->features); + if (check_lcksuppt(body->features)) + dev->flags |= OPAL_FL_LOCKING_SUPPORTED; + if (check_lckenabled(body->features)) + dev->flags |= OPAL_FL_LOCKING_ENABLED; + if (check_locked(body->features)) + dev->flags |= OPAL_FL_LOCKED; + if (check_mbrenabled(body->features)) + dev->flags |= OPAL_FL_MBR_ENABLED; + if (check_mbrdone(body->features)) + dev->flags |= OPAL_FL_MBR_DONE; break; case FC_ENTERPRISE: case FC_DATASTORE: @@ -2109,7 +2149,8 @@ static int check_opal_support(struct opal_dev *dev) mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = opal_discovery0_step(dev); - dev->supported = !ret; + if (!ret) + dev->flags |= OPAL_FL_SUPPORTED; mutex_unlock(&dev->dev_lock); return ret; @@ -2148,6 +2189,7 @@ struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv) INIT_LIST_HEAD(&dev->unlk_lst); mutex_init(&dev->dev_lock); + dev->flags = 0; dev->data = data; dev->send_recv = send_recv; if (check_opal_support(dev) != 0) { @@ -2528,7 +2570,7 @@ bool opal_unlock_from_suspend(struct opal_dev *dev) if (!dev) return false; - if (!dev->supported) + if (!(dev->flags & OPAL_FL_SUPPORTED)) return false; mutex_lock(&dev->dev_lock); @@ -2546,7 +2588,7 @@ bool opal_unlock_from_suspend(struct opal_dev *dev) was_failure = true; } - if (dev->mbr_enabled) { + if (dev->flags & OPAL_FL_MBR_ENABLED) { ret = __opal_set_mbr_done(dev, &suspend->unlk.session.opal_key); if (ret) pr_debug("Failed to set MBR Done in S3 resume\n"); @@ -2620,6 +2662,23 @@ static int opal_generic_read_write_table(struct opal_dev *dev, return ret; } +static int opal_get_status(struct opal_dev *dev, void __user *data) +{ + struct opal_status sts = {0}; + + /* + * check_opal_support() error is not fatal, + * !dev->supported is a valid condition + */ + if (!check_opal_support(dev)) + sts.flags = dev->flags; + if (copy_to_user(data, &sts, sizeof(sts))) { + pr_debug("Error copying status to userspace\n"); + return -EFAULT; + } + return 0; +} + int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) { void *p; @@ -2629,12 +2688,14 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) return -EACCES; if (!dev) return -ENOTSUPP; - if (!dev->supported) + if (!(dev->flags & OPAL_FL_SUPPORTED)) return -ENOTSUPP; - p = memdup_user(arg, _IOC_SIZE(cmd)); - if (IS_ERR(p)) - return PTR_ERR(p); + if (cmd & IOC_IN) { + p = memdup_user(arg, _IOC_SIZE(cmd)); + if (IS_ERR(p)) + return PTR_ERR(p); + } switch (cmd) { case IOC_OPAL_SAVE: @@ -2685,11 +2746,15 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_GENERIC_TABLE_RW: ret = opal_generic_read_write_table(dev, p); break; + case IOC_OPAL_GET_STATUS: + ret = opal_get_status(dev, arg); + break; default: break; } - kfree(p); + if (cmd & IOC_IN) + kfree(p); return ret; } EXPORT_SYMBOL_GPL(sed_ioctl); diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 12b3ca8f6f4a..128722cf6c3c 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -108,7 +108,7 @@ static ssize_t aoedisk_show_payload(struct device *dev, return sysfs_emit(page, "%lu\n", d->maxbcnt); } -static int aoedisk_debugfs_show(struct seq_file *s, void *ignored) +static int aoe_debugfs_show(struct seq_file *s, void *ignored) { struct aoedev *d; struct aoetgt **t, **te; @@ -151,11 +151,7 @@ static int aoedisk_debugfs_show(struct seq_file *s, void *ignored) return 0; } - -static int aoe_debugfs_open(struct inode *inode, struct file *file) -{ - return single_open(file, aoedisk_debugfs_show, inode->i_private); -} +DEFINE_SHOW_ATTRIBUTE(aoe_debugfs); static DEVICE_ATTR(state, 0444, aoedisk_show_state, NULL); static DEVICE_ATTR(mac, 0444, aoedisk_show_mac, NULL); @@ -184,13 +180,6 @@ static const struct attribute_group *aoe_attr_groups[] = { NULL, }; -static const struct file_operations aoe_debugfs_fops = { - .open = aoe_debugfs_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static void aoedisk_add_debugfs(struct aoedev *d) { diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 859499cd1ff8..20acc4a1fd6d 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -397,7 +397,7 @@ static int brd_alloc(int i) disk->minors = max_part; disk->fops = &brd_fops; disk->private_data = brd; - strlcpy(disk->disk_name, buf, DISK_NAME_LEN); + strscpy(disk->disk_name, buf, DISK_NAME_LEN); set_capacity(disk, rd_size * 2); /* diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index f15f2f041596..4d661282ff41 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1529,7 +1529,6 @@ extern int w_send_read_req(struct drbd_work *, int); extern int w_e_reissue(struct drbd_work *, int); extern int w_restart_disk_io(struct drbd_work *, int); extern int w_send_out_of_sync(struct drbd_work *, int); -extern int w_start_resync(struct drbd_work *, int); extern void resync_timer_fn(struct timer_list *t); extern void start_resync_timer_fn(struct timer_list *t); diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 013d355a2033..864c98e74875 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -4752,7 +4752,7 @@ void notify_helper(enum drbd_notification_type type, struct drbd_genlmsghdr *dh; int err; - strlcpy(helper_info.helper_name, name, sizeof(helper_info.helper_name)); + strscpy(helper_info.helper_name, name, sizeof(helper_info.helper_name)); helper_info.helper_name_len = min(strlen(name), sizeof(helper_info.helper_name)); helper_info.helper_status = status; diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index af4c7d65490b..c897c4572036 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2113,9 +2113,6 @@ static int receive_DataReply(struct drbd_connection *connection, struct packet_i if (unlikely(!req)) return -EIO; - /* hlist_del(&req->collision) is done in _req_may_be_done, to avoid - * special casing it there for the various failure cases. - * still no race with drbd_fail_pending_reads */ err = recv_dless_read(peer_device, req, sector, pi->size); if (!err) req_mod(req, DATA_RECEIVED); diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index 511f39a08de4..6237fa1dcb0e 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -266,8 +266,6 @@ struct bio_and_error { extern void start_new_tl_epoch(struct drbd_connection *connection); extern void drbd_req_destroy(struct kref *kref); -extern void _req_may_be_done(struct drbd_request *req, - struct bio_and_error *m); extern int __req_mod(struct drbd_request *req, enum drbd_req_event what, struct bio_and_error *m); extern void complete_master_bio(struct drbd_device *device, diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 562725d222a7..815d77ba6381 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -1397,15 +1397,15 @@ static void mtip_dump_identify(struct mtip_port *port) if (!port->identify_valid) return; - strlcpy(cbuf, (char *)(port->identify+10), 21); + strscpy(cbuf, (char *)(port->identify + 10), 21); dev_info(&port->dd->pdev->dev, "Serial No.: %s\n", cbuf); - strlcpy(cbuf, (char *)(port->identify+23), 9); + strscpy(cbuf, (char *)(port->identify + 23), 9); dev_info(&port->dd->pdev->dev, "Firmware Ver.: %s\n", cbuf); - strlcpy(cbuf, (char *)(port->identify+27), 41); + strscpy(cbuf, (char *)(port->identify + 27), 41); dev_info(&port->dd->pdev->dev, "Model: %s\n", cbuf); dev_info(&port->dd->pdev->dev, "Security: %04x %s\n", @@ -1421,13 +1421,13 @@ static void mtip_dump_identify(struct mtip_port *port) pci_read_config_word(port->dd->pdev, PCI_REVISION_ID, &revid); switch (revid & 0xFF) { case 0x1: - strlcpy(cbuf, "A0", 3); + strscpy(cbuf, "A0", 3); break; case 0x3: - strlcpy(cbuf, "A2", 3); + strscpy(cbuf, "A2", 3); break; default: - strlcpy(cbuf, "?", 2); + strscpy(cbuf, "?", 2); break; } dev_info(&port->dd->pdev->dev, diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 6cec9ce23fd3..4762a03e1ffe 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1413,10 +1413,12 @@ static int nbd_start_device_ioctl(struct nbd_device *nbd) mutex_unlock(&nbd->config_lock); ret = wait_event_interruptible(config->recv_wq, atomic_read(&config->recv_threads) == 0); - if (ret) + if (ret) { sock_shutdown(nbd); - flush_workqueue(nbd->recv_workq); + nbd_clear_que(nbd); + } + flush_workqueue(nbd->recv_workq); mutex_lock(&nbd->config_lock); nbd_bdev_reset(nbd); /* user requested, ignore socket errors */ diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index c451c477978f..1f154f92f4c2 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1528,7 +1528,7 @@ static bool should_requeue_request(struct request *rq) return false; } -static int null_map_queues(struct blk_mq_tag_set *set) +static void null_map_queues(struct blk_mq_tag_set *set) { struct nullb *nullb = set->driver_data; int i, qoff; @@ -1555,7 +1555,9 @@ static int null_map_queues(struct blk_mq_tag_set *set) } else { pr_warn("tag set has unexpected nr_hw_queues: %d\n", set->nr_hw_queues); - return -EINVAL; + WARN_ON_ONCE(true); + submit_queues = 1; + poll_queues = 0; } } @@ -1577,8 +1579,6 @@ static int null_map_queues(struct blk_mq_tag_set *set) qoff += map->nr_queues; blk_mq_map_queues(map); } - - return 0; } static int null_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index e1d080f680ed..c76e0148eada 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -745,7 +745,7 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev) gendisk->flags |= GENHD_FL_NO_PART; gendisk->fops = &ps3vram_fops; gendisk->private_data = dev; - strlcpy(gendisk->disk_name, DEVICE_NAME, sizeof(gendisk->disk_name)); + strscpy(gendisk->disk_name, DEVICE_NAME, sizeof(gendisk->disk_name)); set_capacity(gendisk, priv->size >> 9); blk_queue_max_segments(gendisk->queue, BLK_MAX_SEGMENTS); blk_queue_max_segment_size(gendisk->queue, BLK_MAX_SEGMENT_SIZE); diff --git a/drivers/block/rnbd/Makefile b/drivers/block/rnbd/Makefile index 5bb1a7ad1ada..40b31630822c 100644 --- a/drivers/block/rnbd/Makefile +++ b/drivers/block/rnbd/Makefile @@ -6,10 +6,12 @@ rnbd-client-y := rnbd-clt.o \ rnbd-clt-sysfs.o \ rnbd-common.o +CFLAGS_rnbd-srv-trace.o = -I$(src) + rnbd-server-y := rnbd-common.o \ rnbd-srv.o \ - rnbd-srv-dev.o \ - rnbd-srv-sysfs.o + rnbd-srv-sysfs.o \ + rnbd-srv-trace.o obj-$(CONFIG_BLK_DEV_RNBD_CLIENT) += rnbd-client.o obj-$(CONFIG_BLK_DEV_RNBD_SERVER) += rnbd-server.o diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 04da33a22ef4..78334da74d8b 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1159,13 +1159,11 @@ static int rnbd_rdma_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) { struct rnbd_queue *q = hctx->driver_data; struct rnbd_clt_dev *dev = q->dev; - int cnt; - cnt = rtrs_clt_rdma_cq_direct(dev->sess->rtrs, hctx->queue_num); - return cnt; + return rtrs_clt_rdma_cq_direct(dev->sess->rtrs, hctx->queue_num); } -static int rnbd_rdma_map_queues(struct blk_mq_tag_set *set) +static void rnbd_rdma_map_queues(struct blk_mq_tag_set *set) { struct rnbd_clt_session *sess = set->driver_data; @@ -1194,8 +1192,6 @@ static int rnbd_rdma_map_queues(struct blk_mq_tag_set *set) set->map[HCTX_TYPE_DEFAULT].nr_queues, set->map[HCTX_TYPE_READ].nr_queues); } - - return 0; } static struct blk_mq_ops rnbd_mq_ops = { diff --git a/drivers/block/rnbd/rnbd-srv-dev.c b/drivers/block/rnbd/rnbd-srv-dev.c deleted file mode 100644 index c63017f6e421..000000000000 --- a/drivers/block/rnbd/rnbd-srv-dev.c +++ /dev/null @@ -1,43 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * RDMA Network Block Driver - * - * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. - * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. - * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. - */ -#undef pr_fmt -#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt - -#include "rnbd-srv-dev.h" -#include "rnbd-log.h" - -struct rnbd_dev *rnbd_dev_open(const char *path, fmode_t flags) -{ - struct rnbd_dev *dev; - int ret; - - dev = kzalloc(sizeof(*dev), GFP_KERNEL); - if (!dev) - return ERR_PTR(-ENOMEM); - - dev->blk_open_flags = flags; - dev->bdev = blkdev_get_by_path(path, flags, THIS_MODULE); - ret = PTR_ERR_OR_ZERO(dev->bdev); - if (ret) - goto err; - - dev->blk_open_flags = flags; - - return dev; - -err: - kfree(dev); - return ERR_PTR(ret); -} - -void rnbd_dev_close(struct rnbd_dev *dev) -{ - blkdev_put(dev->bdev, dev->blk_open_flags); - kfree(dev); -} diff --git a/drivers/block/rnbd/rnbd-srv-dev.h b/drivers/block/rnbd/rnbd-srv-dev.h deleted file mode 100644 index 8407d12f70af..000000000000 --- a/drivers/block/rnbd/rnbd-srv-dev.h +++ /dev/null @@ -1,64 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * RDMA Network Block Driver - * - * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. - * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. - * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. - */ -#ifndef RNBD_SRV_DEV_H -#define RNBD_SRV_DEV_H - -#include <linux/fs.h> -#include "rnbd-proto.h" - -struct rnbd_dev { - struct block_device *bdev; - fmode_t blk_open_flags; -}; - -/** - * rnbd_dev_open() - Open a device - * @path: path to open - * @flags: open flags - */ -struct rnbd_dev *rnbd_dev_open(const char *path, fmode_t flags); - -/** - * rnbd_dev_close() - Close a device - */ -void rnbd_dev_close(struct rnbd_dev *dev); - -void rnbd_endio(void *priv, int error); - -static inline int rnbd_dev_get_max_segs(const struct rnbd_dev *dev) -{ - return queue_max_segments(bdev_get_queue(dev->bdev)); -} - -static inline int rnbd_dev_get_max_hw_sects(const struct rnbd_dev *dev) -{ - return queue_max_hw_sectors(bdev_get_queue(dev->bdev)); -} - -static inline int rnbd_dev_get_secure_discard(const struct rnbd_dev *dev) -{ - return bdev_max_secure_erase_sectors(dev->bdev); -} - -static inline int rnbd_dev_get_max_discard_sects(const struct rnbd_dev *dev) -{ - return bdev_max_discard_sectors(dev->bdev); -} - -static inline int rnbd_dev_get_discard_granularity(const struct rnbd_dev *dev) -{ - return bdev_get_queue(dev->bdev)->limits.discard_granularity; -} - -static inline int rnbd_dev_get_discard_alignment(const struct rnbd_dev *dev) -{ - return bdev_discard_alignment(dev->bdev); -} - -#endif /* RNBD_SRV_DEV_H */ diff --git a/drivers/block/rnbd/rnbd-srv-trace.c b/drivers/block/rnbd/rnbd-srv-trace.c new file mode 100644 index 000000000000..30f0895c18f5 --- /dev/null +++ b/drivers/block/rnbd/rnbd-srv-trace.c @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * RDMA Network Block Driver + * + * Copyright (c) 2022 1&1 IONOS SE. All rights reserved. + */ +#include "rtrs.h" +#include "rtrs-srv.h" +#include "rnbd-srv.h" +#include "rnbd-proto.h" + +/* + * We include this last to have the helpers above available for the trace + * event implementations. + */ +#define CREATE_TRACE_POINTS +#include "rnbd-srv-trace.h" diff --git a/drivers/block/rnbd/rnbd-srv-trace.h b/drivers/block/rnbd/rnbd-srv-trace.h new file mode 100644 index 000000000000..8dedf73bdd28 --- /dev/null +++ b/drivers/block/rnbd/rnbd-srv-trace.h @@ -0,0 +1,207 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * RDMA Network Block Driver + * + * Copyright (c) 2022 1&1 IONOS SE. All rights reserved. + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rnbd_srv + +#if !defined(_TRACE_RNBD_SRV_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_RNBD_SRV_H + +#include <linux/tracepoint.h> + +struct rnbd_srv_session; +struct rtrs_srv_op; + +DECLARE_EVENT_CLASS(rnbd_srv_link_class, + TP_PROTO(struct rnbd_srv_session *srv), + + TP_ARGS(srv), + + TP_STRUCT__entry( + __field(int, qdepth) + __string(sessname, srv->sessname) + ), + + TP_fast_assign( + __entry->qdepth = srv->queue_depth; + __assign_str(sessname, srv->sessname); + ), + + TP_printk("sessname: %s qdepth: %d", + __get_str(sessname), + __entry->qdepth + ) +); + +#define DEFINE_LINK_EVENT(name) \ +DEFINE_EVENT(rnbd_srv_link_class, name, \ + TP_PROTO(struct rnbd_srv_session *srv), \ + TP_ARGS(srv)) + +DEFINE_LINK_EVENT(create_sess); +DEFINE_LINK_EVENT(destroy_sess); + +TRACE_DEFINE_ENUM(RNBD_OP_READ); +TRACE_DEFINE_ENUM(RNBD_OP_WRITE); +TRACE_DEFINE_ENUM(RNBD_OP_FLUSH); +TRACE_DEFINE_ENUM(RNBD_OP_DISCARD); +TRACE_DEFINE_ENUM(RNBD_OP_SECURE_ERASE); +TRACE_DEFINE_ENUM(RNBD_F_SYNC); +TRACE_DEFINE_ENUM(RNBD_F_FUA); + +#define show_rnbd_rw_flags(x) \ + __print_flags(x, "|", \ + { RNBD_OP_READ, "READ" }, \ + { RNBD_OP_WRITE, "WRITE" }, \ + { RNBD_OP_FLUSH, "FLUSH" }, \ + { RNBD_OP_DISCARD, "DISCARD" }, \ + { RNBD_OP_SECURE_ERASE, "SECURE_ERASE" }, \ + { RNBD_F_SYNC, "SYNC" }, \ + { RNBD_F_FUA, "FUA" }) + +TRACE_EVENT(process_rdma, + TP_PROTO(struct rnbd_srv_session *srv, + const struct rnbd_msg_io *msg, + struct rtrs_srv_op *id, + u32 datalen, + size_t usrlen), + + TP_ARGS(srv, msg, id, datalen, usrlen), + + TP_STRUCT__entry( + __string(sessname, srv->sessname) + __field(u8, dir) + __field(u8, ver) + __field(u32, device_id) + __field(u64, sector) + __field(u32, flags) + __field(u32, bi_size) + __field(u16, ioprio) + __field(u32, datalen) + __field(size_t, usrlen) + ), + + TP_fast_assign( + __assign_str(sessname, srv->sessname); + __entry->dir = id->dir; + __entry->ver = srv->ver; + __entry->device_id = le32_to_cpu(msg->device_id); + __entry->sector = le64_to_cpu(msg->sector); + __entry->bi_size = le32_to_cpu(msg->bi_size); + __entry->flags = le32_to_cpu(msg->rw); + __entry->ioprio = le16_to_cpu(msg->prio); + __entry->datalen = datalen; + __entry->usrlen = usrlen; + ), + + TP_printk("I/O req: sess: %s, type: %s, ver: %d, devid: %u, sector: %llu, bsize: %u, flags: %s, ioprio: %d, datalen: %u, usrlen: %zu", + __get_str(sessname), + __print_symbolic(__entry->dir, + { READ, "READ" }, + { WRITE, "WRITE" }), + __entry->ver, + __entry->device_id, + __entry->sector, + __entry->bi_size, + show_rnbd_rw_flags(__entry->flags), + __entry->ioprio, + __entry->datalen, + __entry->usrlen + ) +); + +TRACE_EVENT(process_msg_sess_info, + TP_PROTO(struct rnbd_srv_session *srv, + const struct rnbd_msg_sess_info *msg), + + TP_ARGS(srv, msg), + + TP_STRUCT__entry( + __field(u8, proto_ver) + __field(u8, clt_ver) + __field(u8, srv_ver) + __string(sessname, srv->sessname) + ), + + TP_fast_assign( + __entry->proto_ver = srv->ver; + __entry->clt_ver = msg->ver; + __entry->srv_ver = RNBD_PROTO_VER_MAJOR; + __assign_str(sessname, srv->sessname); + ), + + TP_printk("Session %s using proto-ver %d (clt-ver: %d, srv-ver: %d)", + __get_str(sessname), + __entry->proto_ver, + __entry->clt_ver, + __entry->srv_ver + ) +); + +TRACE_DEFINE_ENUM(RNBD_ACCESS_RO); +TRACE_DEFINE_ENUM(RNBD_ACCESS_RW); +TRACE_DEFINE_ENUM(RNBD_ACCESS_MIGRATION); + +#define show_rnbd_access_mode(x) \ + __print_symbolic(x, \ + { RNBD_ACCESS_RO, "RO" }, \ + { RNBD_ACCESS_RW, "RW" }, \ + { RNBD_ACCESS_MIGRATION, "MIGRATION" }) + +TRACE_EVENT(process_msg_open, + TP_PROTO(struct rnbd_srv_session *srv, + const struct rnbd_msg_open *msg), + + TP_ARGS(srv, msg), + + TP_STRUCT__entry( + __field(u8, access_mode) + __string(sessname, srv->sessname) + __string(dev_name, msg->dev_name) + ), + + TP_fast_assign( + __entry->access_mode = msg->access_mode; + __assign_str(sessname, srv->sessname); + __assign_str(dev_name, msg->dev_name); + ), + + TP_printk("Open message received: session='%s' path='%s' access_mode=%s", + __get_str(sessname), + __get_str(dev_name), + show_rnbd_access_mode(__entry->access_mode) + ) +); + +TRACE_EVENT(process_msg_close, + TP_PROTO(struct rnbd_srv_session *srv, + const struct rnbd_msg_close *msg), + + TP_ARGS(srv, msg), + + TP_STRUCT__entry( + __field(u32, device_id) + __string(sessname, srv->sessname) + ), + + TP_fast_assign( + __entry->device_id = le32_to_cpu(msg->device_id); + __assign_str(sessname, srv->sessname); + ), + + TP_printk("Close message received: session='%s' device id='%d'", + __get_str(sessname), + __entry->device_id + ) +); + +#endif /* _TRACE_RNBD_SRV_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE rnbd-srv-trace +#include <trace/define_trace.h> + diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index 5e08da277ddf..08b041159cd3 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -13,7 +13,7 @@ #include <linux/blkdev.h> #include "rnbd-srv.h" -#include "rnbd-srv-dev.h" +#include "rnbd-srv-trace.h" MODULE_DESCRIPTION("RDMA Network Block Device Server"); MODULE_LICENSE("GPL"); @@ -84,18 +84,6 @@ static inline void rnbd_put_sess_dev(struct rnbd_srv_sess_dev *sess_dev) kref_put(&sess_dev->kref, rnbd_sess_dev_release); } -void rnbd_endio(void *priv, int error) -{ - struct rnbd_io_private *rnbd_priv = priv; - struct rnbd_srv_sess_dev *sess_dev = rnbd_priv->sess_dev; - - rnbd_put_sess_dev(sess_dev); - - rtrs_srv_resp_rdma(rnbd_priv->id, error); - - kfree(priv); -} - static struct rnbd_srv_sess_dev * rnbd_get_sess_dev(int dev_id, struct rnbd_srv_session *srv_sess) { @@ -116,7 +104,13 @@ rnbd_get_sess_dev(int dev_id, struct rnbd_srv_session *srv_sess) static void rnbd_dev_bi_end_io(struct bio *bio) { - rnbd_endio(bio->bi_private, blk_status_to_errno(bio->bi_status)); + struct rnbd_io_private *rnbd_priv = bio->bi_private; + struct rnbd_srv_sess_dev *sess_dev = rnbd_priv->sess_dev; + + rnbd_put_sess_dev(sess_dev); + rtrs_srv_resp_rdma(rnbd_priv->id, blk_status_to_errno(bio->bi_status)); + + kfree(rnbd_priv); bio_put(bio); } @@ -132,6 +126,8 @@ static int process_rdma(struct rnbd_srv_session *srv_sess, struct bio *bio; short prio; + trace_process_rdma(srv_sess, msg, id, datalen, usrlen); + priv = kmalloc(sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; @@ -149,7 +145,7 @@ static int process_rdma(struct rnbd_srv_session *srv_sess, priv->sess_dev = sess_dev; priv->id = id; - bio = bio_alloc(sess_dev->rnbd_dev->bdev, 1, + bio = bio_alloc(sess_dev->bdev, 1, rnbd_to_bio_flags(le32_to_cpu(msg->rw)), GFP_KERNEL); if (bio_add_page(bio, virt_to_page(data), datalen, offset_in_page(data)) != datalen) { @@ -223,7 +219,7 @@ void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev, bool keep_id) rnbd_put_sess_dev(sess_dev); wait_for_completion(&dc); /* wait for inflights to drop to zero */ - rnbd_dev_close(sess_dev->rnbd_dev); + blkdev_put(sess_dev->bdev, sess_dev->open_flags); mutex_lock(&sess_dev->dev->lock); list_del(&sess_dev->dev_list); if (sess_dev->open_flags & FMODE_WRITE) @@ -244,6 +240,8 @@ static void destroy_sess(struct rnbd_srv_session *srv_sess) if (xa_empty(&srv_sess->index_idr)) goto out; + trace_destroy_sess(srv_sess); + mutex_lock(&srv_sess->lock); xa_for_each(&srv_sess->index_idr, index, sess_dev) rnbd_srv_destroy_dev_session_sysfs(sess_dev); @@ -290,6 +288,8 @@ static int create_sess(struct rtrs_srv_sess *rtrs) rtrs_srv_set_sess_priv(rtrs, srv_sess); + trace_create_sess(srv_sess); + return 0; } @@ -332,23 +332,24 @@ void rnbd_srv_sess_dev_force_close(struct rnbd_srv_sess_dev *sess_dev, mutex_unlock(&sess->lock); } -static int process_msg_close(struct rnbd_srv_session *srv_sess, +static void process_msg_close(struct rnbd_srv_session *srv_sess, void *data, size_t datalen, const void *usr, size_t usrlen) { const struct rnbd_msg_close *close_msg = usr; struct rnbd_srv_sess_dev *sess_dev; + trace_process_msg_close(srv_sess, close_msg); + sess_dev = rnbd_get_sess_dev(le32_to_cpu(close_msg->device_id), srv_sess); if (IS_ERR(sess_dev)) - return 0; + return; rnbd_put_sess_dev(sess_dev); mutex_lock(&srv_sess->lock); rnbd_srv_destroy_dev_session_sysfs(sess_dev); mutex_unlock(&srv_sess->lock); - return 0; } static int process_msg_open(struct rnbd_srv_session *srv_sess, @@ -378,7 +379,7 @@ static int rnbd_srv_rdma_ev(void *priv, case RNBD_MSG_IO: return process_rdma(srv_sess, id, data, datalen, usr, usrlen); case RNBD_MSG_CLOSE: - ret = process_msg_close(srv_sess, data, datalen, usr, usrlen); + process_msg_close(srv_sess, data, datalen, usr, usrlen); break; case RNBD_MSG_OPEN: ret = process_msg_open(srv_sess, usr, usrlen, data, datalen); @@ -393,6 +394,11 @@ static int rnbd_srv_rdma_ev(void *priv, return -EINVAL; } + /* + * Since ret is passed to rtrs to handle the failure case, we + * just return 0 at the end otherwise callers in rtrs would call + * send_io_resp_imm again to print redundant err message. + */ rtrs_srv_resp_rdma(id, ret); return 0; } @@ -504,14 +510,14 @@ static int rnbd_srv_check_update_open_perm(struct rnbd_srv_dev *srv_dev, } static struct rnbd_srv_dev * -rnbd_srv_get_or_create_srv_dev(struct rnbd_dev *rnbd_dev, +rnbd_srv_get_or_create_srv_dev(struct block_device *bdev, struct rnbd_srv_session *srv_sess, enum rnbd_access_mode access_mode) { int ret; struct rnbd_srv_dev *new_dev, *dev; - new_dev = rnbd_srv_init_srv_dev(rnbd_dev->bdev); + new_dev = rnbd_srv_init_srv_dev(bdev); if (IS_ERR(new_dev)) return new_dev; @@ -531,41 +537,32 @@ rnbd_srv_get_or_create_srv_dev(struct rnbd_dev *rnbd_dev, static void rnbd_srv_fill_msg_open_rsp(struct rnbd_msg_open_rsp *rsp, struct rnbd_srv_sess_dev *sess_dev) { - struct rnbd_dev *rnbd_dev = sess_dev->rnbd_dev; + struct block_device *bdev = sess_dev->bdev; rsp->hdr.type = cpu_to_le16(RNBD_MSG_OPEN_RSP); - rsp->device_id = - cpu_to_le32(sess_dev->device_id); - rsp->nsectors = - cpu_to_le64(get_capacity(rnbd_dev->bdev->bd_disk)); - rsp->logical_block_size = - cpu_to_le16(bdev_logical_block_size(rnbd_dev->bdev)); - rsp->physical_block_size = - cpu_to_le16(bdev_physical_block_size(rnbd_dev->bdev)); - rsp->max_segments = - cpu_to_le16(rnbd_dev_get_max_segs(rnbd_dev)); + rsp->device_id = cpu_to_le32(sess_dev->device_id); + rsp->nsectors = cpu_to_le64(bdev_nr_sectors(bdev)); + rsp->logical_block_size = cpu_to_le16(bdev_logical_block_size(bdev)); + rsp->physical_block_size = cpu_to_le16(bdev_physical_block_size(bdev)); + rsp->max_segments = cpu_to_le16(bdev_max_segments(bdev)); rsp->max_hw_sectors = - cpu_to_le32(rnbd_dev_get_max_hw_sects(rnbd_dev)); + cpu_to_le32(queue_max_hw_sectors(bdev_get_queue(bdev))); rsp->max_write_same_sectors = 0; - rsp->max_discard_sectors = - cpu_to_le32(rnbd_dev_get_max_discard_sects(rnbd_dev)); - rsp->discard_granularity = - cpu_to_le32(rnbd_dev_get_discard_granularity(rnbd_dev)); - rsp->discard_alignment = - cpu_to_le32(rnbd_dev_get_discard_alignment(rnbd_dev)); - rsp->secure_discard = - cpu_to_le16(rnbd_dev_get_secure_discard(rnbd_dev)); + rsp->max_discard_sectors = cpu_to_le32(bdev_max_discard_sectors(bdev)); + rsp->discard_granularity = cpu_to_le32(bdev_discard_granularity(bdev)); + rsp->discard_alignment = cpu_to_le32(bdev_discard_alignment(bdev)); + rsp->secure_discard = cpu_to_le16(bdev_max_secure_erase_sectors(bdev)); rsp->cache_policy = 0; - if (bdev_write_cache(rnbd_dev->bdev)) + if (bdev_write_cache(bdev)) rsp->cache_policy |= RNBD_WRITEBACK; - if (bdev_fua(rnbd_dev->bdev)) + if (bdev_fua(bdev)) rsp->cache_policy |= RNBD_FUA; } static struct rnbd_srv_sess_dev * rnbd_srv_create_set_sess_dev(struct rnbd_srv_session *srv_sess, const struct rnbd_msg_open *open_msg, - struct rnbd_dev *rnbd_dev, fmode_t open_flags, + struct block_device *bdev, fmode_t open_flags, struct rnbd_srv_dev *srv_dev) { struct rnbd_srv_sess_dev *sdev = rnbd_sess_dev_alloc(srv_sess); @@ -577,7 +574,7 @@ rnbd_srv_create_set_sess_dev(struct rnbd_srv_session *srv_sess, strscpy(sdev->pathname, open_msg->dev_name, sizeof(sdev->pathname)); - sdev->rnbd_dev = rnbd_dev; + sdev->bdev = bdev; sdev->sess = srv_sess; sdev->dev = srv_dev; sdev->open_flags = open_flags; @@ -643,9 +640,8 @@ static int process_msg_sess_info(struct rnbd_srv_session *srv_sess, struct rnbd_msg_sess_info_rsp *rsp = data; srv_sess->ver = min_t(u8, sess_info_msg->ver, RNBD_PROTO_VER_MAJOR); - pr_debug("Session %s using protocol version %d (client version: %d, server version: %d)\n", - srv_sess->sessname, srv_sess->ver, - sess_info_msg->ver, RNBD_PROTO_VER_MAJOR); + + trace_process_msg_sess_info(srv_sess, sess_info_msg); rsp->hdr.type = cpu_to_le16(RNBD_MSG_SESS_INFO_RSP); rsp->ver = srv_sess->ver; @@ -685,14 +681,13 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess, struct rnbd_srv_dev *srv_dev; struct rnbd_srv_sess_dev *srv_sess_dev; const struct rnbd_msg_open *open_msg = msg; + struct block_device *bdev; fmode_t open_flags; char *full_path; - struct rnbd_dev *rnbd_dev; struct rnbd_msg_open_rsp *rsp = data; - pr_debug("Open message received: session='%s' path='%s' access_mode=%d\n", - srv_sess->sessname, open_msg->dev_name, - open_msg->access_mode); + trace_process_msg_open(srv_sess, open_msg); + open_flags = FMODE_READ; if (open_msg->access_mode != RNBD_ACCESS_RO) open_flags |= FMODE_WRITE; @@ -725,25 +720,25 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess, goto reject; } - rnbd_dev = rnbd_dev_open(full_path, open_flags); - if (IS_ERR(rnbd_dev)) { - pr_err("Opening device '%s' on session %s failed, failed to open the block device, err: %ld\n", - full_path, srv_sess->sessname, PTR_ERR(rnbd_dev)); - ret = PTR_ERR(rnbd_dev); + bdev = blkdev_get_by_path(full_path, open_flags, THIS_MODULE); + if (IS_ERR(bdev)) { + ret = PTR_ERR(bdev); + pr_err("Opening device '%s' on session %s failed, failed to open the block device, err: %d\n", + full_path, srv_sess->sessname, ret); goto free_path; } - srv_dev = rnbd_srv_get_or_create_srv_dev(rnbd_dev, srv_sess, + srv_dev = rnbd_srv_get_or_create_srv_dev(bdev, srv_sess, open_msg->access_mode); if (IS_ERR(srv_dev)) { pr_err("Opening device '%s' on session %s failed, creating srv_dev failed, err: %ld\n", full_path, srv_sess->sessname, PTR_ERR(srv_dev)); ret = PTR_ERR(srv_dev); - goto rnbd_dev_close; + goto blkdev_put; } srv_sess_dev = rnbd_srv_create_set_sess_dev(srv_sess, open_msg, - rnbd_dev, open_flags, + bdev, open_flags, srv_dev); if (IS_ERR(srv_sess_dev)) { pr_err("Opening device '%s' on session %s failed, creating sess_dev failed, err: %ld\n", @@ -758,7 +753,7 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess, */ mutex_lock(&srv_dev->lock); if (!srv_dev->dev_kobj.state_in_sysfs) { - ret = rnbd_srv_create_dev_sysfs(srv_dev, rnbd_dev->bdev); + ret = rnbd_srv_create_dev_sysfs(srv_dev, bdev); if (ret) { mutex_unlock(&srv_dev->lock); rnbd_srv_err(srv_sess_dev, @@ -800,8 +795,8 @@ srv_dev_put: mutex_unlock(&srv_dev->lock); } rnbd_put_srv_dev(srv_dev); -rnbd_dev_close: - rnbd_dev_close(rnbd_dev); +blkdev_put: + blkdev_put(bdev, open_flags); free_path: kfree(full_path); reject: diff --git a/drivers/block/rnbd/rnbd-srv.h b/drivers/block/rnbd/rnbd-srv.h index 081bceaf4ae9..f5962fd31d62 100644 --- a/drivers/block/rnbd/rnbd-srv.h +++ b/drivers/block/rnbd/rnbd-srv.h @@ -46,7 +46,7 @@ struct rnbd_srv_dev { struct rnbd_srv_sess_dev { /* Entry inside rnbd_srv_dev struct */ struct list_head dev_list; - struct rnbd_dev *rnbd_dev; + struct block_device *bdev; struct rnbd_srv_session *sess; struct rnbd_srv_dev *dev; struct kobject kobj; diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 6a4a94b4cdf4..2651bf41dde3 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -49,7 +49,9 @@ /* All UBLK_F_* have to be included into UBLK_F_ALL */ #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \ | UBLK_F_URING_CMD_COMP_IN_TASK \ - | UBLK_F_NEED_GET_DATA) + | UBLK_F_NEED_GET_DATA \ + | UBLK_F_USER_RECOVERY \ + | UBLK_F_USER_RECOVERY_REISSUE) /* All UBLK_PARAM_TYPE_* should be included here */ #define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD) @@ -119,7 +121,7 @@ struct ublk_queue { unsigned long io_addr; /* mapped vm address */ unsigned int max_io_sz; - bool abort_work_pending; + bool force_abort; unsigned short nr_io_ready; /* how many ios setup */ struct ublk_device *dev; struct ublk_io ios[0]; @@ -161,6 +163,7 @@ struct ublk_device { * monitor each queue's daemon periodically */ struct delayed_work monitor_work; + struct work_struct quiesce_work; struct work_struct stop_work; }; @@ -323,6 +326,30 @@ static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id) PAGE_SIZE); } +static inline bool ublk_queue_can_use_recovery_reissue( + struct ublk_queue *ubq) +{ + if ((ubq->flags & UBLK_F_USER_RECOVERY) && + (ubq->flags & UBLK_F_USER_RECOVERY_REISSUE)) + return true; + return false; +} + +static inline bool ublk_queue_can_use_recovery( + struct ublk_queue *ubq) +{ + if (ubq->flags & UBLK_F_USER_RECOVERY) + return true; + return false; +} + +static inline bool ublk_can_use_recovery(struct ublk_device *ub) +{ + if (ub->dev_info.flags & UBLK_F_USER_RECOVERY) + return true; + return false; +} + static void ublk_free_disk(struct gendisk *disk) { struct ublk_device *ub = disk->private_data; @@ -612,13 +639,17 @@ static void ublk_complete_rq(struct request *req) * Also aborting may not be started yet, keep in mind that one failed * request may be issued by block layer again. */ -static void __ublk_fail_req(struct ublk_io *io, struct request *req) +static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io, + struct request *req) { WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE); if (!(io->flags & UBLK_IO_FLAG_ABORTED)) { io->flags |= UBLK_IO_FLAG_ABORTED; - blk_mq_end_request(req, BLK_STS_IOERR); + if (ublk_queue_can_use_recovery_reissue(ubq)) + blk_mq_requeue_request(req, false); + else + blk_mq_end_request(req, BLK_STS_IOERR); } } @@ -639,22 +670,40 @@ static void ubq_complete_io_cmd(struct ublk_io *io, int res) #define UBLK_REQUEUE_DELAY_MS 3 +static inline void __ublk_abort_rq(struct ublk_queue *ubq, + struct request *rq) +{ + /* We cannot process this rq so just requeue it. */ + if (ublk_queue_can_use_recovery(ubq)) + blk_mq_requeue_request(rq, false); + else + blk_mq_end_request(rq, BLK_STS_IOERR); + + mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0); +} + static inline void __ublk_rq_task_work(struct request *req) { struct ublk_queue *ubq = req->mq_hctx->driver_data; - struct ublk_device *ub = ubq->dev; int tag = req->tag; struct ublk_io *io = &ubq->ios[tag]; - bool task_exiting = current != ubq->ubq_daemon || ubq_daemon_is_dying(ubq); unsigned int mapped_bytes; pr_devel("%s: complete: op %d, qid %d tag %d io_flags %x addr %llx\n", __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags, ublk_get_iod(ubq, req->tag)->addr); - if (unlikely(task_exiting)) { - blk_mq_end_request(req, BLK_STS_IOERR); - mod_delayed_work(system_wq, &ub->monitor_work, 0); + /* + * Task is exiting if either: + * + * (1) current != ubq_daemon. + * io_uring_cmd_complete_in_task() tries to run task_work + * in a workqueue if ubq_daemon(cmd's task) is PF_EXITING. + * + * (2) current->flags & PF_EXITING. + */ + if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING)) { + __ublk_abort_rq(ubq, req); return; } @@ -739,13 +788,24 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, res = ublk_setup_iod(ubq, rq); if (unlikely(res != BLK_STS_OK)) return BLK_STS_IOERR; + /* With recovery feature enabled, force_abort is set in + * ublk_stop_dev() before calling del_gendisk(). We have to + * abort all requeued and new rqs here to let del_gendisk() + * move on. Besides, we cannot not call io_uring_cmd_complete_in_task() + * to avoid UAF on io_uring ctx. + * + * Note: force_abort is guaranteed to be seen because it is set + * before request queue is unqiuesced. + */ + if (ublk_queue_can_use_recovery(ubq) && unlikely(ubq->force_abort)) + return BLK_STS_IOERR; blk_mq_start_request(bd->rq); if (unlikely(ubq_daemon_is_dying(ubq))) { fail: - mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0); - return BLK_STS_IOERR; + __ublk_abort_rq(ubq, rq); + return BLK_STS_OK; } if (ublk_can_use_task_work(ubq)) { @@ -916,7 +976,7 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq) */ rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i); if (rq) - __ublk_fail_req(io, rq); + __ublk_fail_req(ubq, io, rq); } } ublk_put_device(ub); @@ -932,7 +992,10 @@ static void ublk_daemon_monitor_work(struct work_struct *work) struct ublk_queue *ubq = ublk_get_queue(ub, i); if (ubq_daemon_is_dying(ubq)) { - schedule_work(&ub->stop_work); + if (ublk_queue_can_use_recovery(ubq)) + schedule_work(&ub->quiesce_work); + else + schedule_work(&ub->stop_work); /* abort queue is for making forward progress */ ublk_abort_queue(ub, ubq); @@ -940,12 +1003,13 @@ static void ublk_daemon_monitor_work(struct work_struct *work) } /* - * We can't schedule monitor work after ublk_remove() is started. + * We can't schedule monitor work after ub's state is not UBLK_S_DEV_LIVE. + * after ublk_remove() or __ublk_quiesce_dev() is started. * * No need ub->mutex, monitor work are canceled after state is marked - * as DEAD, so DEAD state is observed reliably. + * as not LIVE, so new state is observed reliably. */ - if (ub->dev_info.state != UBLK_S_DEV_DEAD) + if (ub->dev_info.state == UBLK_S_DEV_LIVE) schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD); } @@ -982,12 +1046,97 @@ static void ublk_cancel_dev(struct ublk_device *ub) ublk_cancel_queue(ublk_get_queue(ub, i)); } -static void ublk_stop_dev(struct ublk_device *ub) +static bool ublk_check_inflight_rq(struct request *rq, void *data) +{ + bool *idle = data; + + if (blk_mq_request_started(rq)) { + *idle = false; + return false; + } + return true; +} + +static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub) { + bool idle; + + WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue)); + while (true) { + idle = true; + blk_mq_tagset_busy_iter(&ub->tag_set, + ublk_check_inflight_rq, &idle); + if (idle) + break; + msleep(UBLK_REQUEUE_DELAY_MS); + } +} + +static void __ublk_quiesce_dev(struct ublk_device *ub) +{ + pr_devel("%s: quiesce ub: dev_id %d state %s\n", + __func__, ub->dev_info.dev_id, + ub->dev_info.state == UBLK_S_DEV_LIVE ? + "LIVE" : "QUIESCED"); + blk_mq_quiesce_queue(ub->ub_disk->queue); + ublk_wait_tagset_rqs_idle(ub); + ub->dev_info.state = UBLK_S_DEV_QUIESCED; + ublk_cancel_dev(ub); + /* we are going to release task_struct of ubq_daemon and resets + * ->ubq_daemon to NULL. So in monitor_work, check on ubq_daemon causes UAF. + * Besides, monitor_work is not necessary in QUIESCED state since we have + * already scheduled quiesce_work and quiesced all ubqs. + * + * Do not let monitor_work schedule itself if state it QUIESCED. And we cancel + * it here and re-schedule it in END_USER_RECOVERY to avoid UAF. + */ + cancel_delayed_work_sync(&ub->monitor_work); +} + +static void ublk_quiesce_work_fn(struct work_struct *work) +{ + struct ublk_device *ub = + container_of(work, struct ublk_device, quiesce_work); + mutex_lock(&ub->mutex); if (ub->dev_info.state != UBLK_S_DEV_LIVE) goto unlock; + __ublk_quiesce_dev(ub); + unlock: + mutex_unlock(&ub->mutex); +} + +static void ublk_unquiesce_dev(struct ublk_device *ub) +{ + int i; + + pr_devel("%s: unquiesce ub: dev_id %d state %s\n", + __func__, ub->dev_info.dev_id, + ub->dev_info.state == UBLK_S_DEV_LIVE ? + "LIVE" : "QUIESCED"); + /* quiesce_work has run. We let requeued rqs be aborted + * before running fallback_wq. "force_abort" must be seen + * after request queue is unqiuesced. Then del_gendisk() + * can move on. + */ + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) + ublk_get_queue(ub, i)->force_abort = true; + + blk_mq_unquiesce_queue(ub->ub_disk->queue); + /* We may have requeued some rqs in ublk_quiesce_queue() */ + blk_mq_kick_requeue_list(ub->ub_disk->queue); +} +static void ublk_stop_dev(struct ublk_device *ub) +{ + mutex_lock(&ub->mutex); + if (ub->dev_info.state == UBLK_S_DEV_DEAD) + goto unlock; + if (ublk_can_use_recovery(ub)) { + if (ub->dev_info.state == UBLK_S_DEV_LIVE) + __ublk_quiesce_dev(ub); + ublk_unquiesce_dev(ub); + } del_gendisk(ub->ub_disk); ub->dev_info.state = UBLK_S_DEV_DEAD; ub->dev_info.ublksrv_pid = -1; @@ -1311,6 +1460,7 @@ static void ublk_remove(struct ublk_device *ub) { ublk_stop_dev(ub); cancel_work_sync(&ub->stop_work); + cancel_work_sync(&ub->quiesce_work); cdev_device_del(&ub->cdev, &ub->cdev_dev); put_device(&ub->cdev_dev); } @@ -1487,6 +1637,7 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) goto out_unlock; mutex_init(&ub->mutex); spin_lock_init(&ub->mm_lock); + INIT_WORK(&ub->quiesce_work, ublk_quiesce_work_fn); INIT_WORK(&ub->stop_work, ublk_stop_work_fn); INIT_DELAYED_WORK(&ub->monitor_work, ublk_daemon_monitor_work); @@ -1607,6 +1758,7 @@ static int ublk_ctrl_stop_dev(struct io_uring_cmd *cmd) ublk_stop_dev(ub); cancel_work_sync(&ub->stop_work); + cancel_work_sync(&ub->quiesce_work); ublk_put_device(ub); return 0; @@ -1709,6 +1861,116 @@ static int ublk_ctrl_set_params(struct io_uring_cmd *cmd) return ret; } +static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) +{ + int i; + + WARN_ON_ONCE(!(ubq->ubq_daemon && ubq_daemon_is_dying(ubq))); + /* All old ioucmds have to be completed */ + WARN_ON_ONCE(ubq->nr_io_ready); + /* old daemon is PF_EXITING, put it now */ + put_task_struct(ubq->ubq_daemon); + /* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */ + ubq->ubq_daemon = NULL; + + for (i = 0; i < ubq->q_depth; i++) { + struct ublk_io *io = &ubq->ios[i]; + + /* forget everything now and be ready for new FETCH_REQ */ + io->flags = 0; + io->cmd = NULL; + io->addr = 0; + } +} + +static int ublk_ctrl_start_recovery(struct io_uring_cmd *cmd) +{ + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + struct ublk_device *ub; + int ret = -EINVAL; + int i; + + ub = ublk_get_device_from_id(header->dev_id); + if (!ub) + return ret; + + mutex_lock(&ub->mutex); + if (!ublk_can_use_recovery(ub)) + goto out_unlock; + /* + * START_RECOVERY is only allowd after: + * + * (1) UB_STATE_OPEN is not set, which means the dying process is exited + * and related io_uring ctx is freed so file struct of /dev/ublkcX is + * released. + * + * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work: + * (a)has quiesced request queue + * (b)has requeued every inflight rqs whose io_flags is ACTIVE + * (c)has requeued/aborted every inflight rqs whose io_flags is NOT ACTIVE + * (d)has completed/camceled all ioucmds owned by ther dying process + */ + if (test_bit(UB_STATE_OPEN, &ub->state) || + ub->dev_info.state != UBLK_S_DEV_QUIESCED) { + ret = -EBUSY; + goto out_unlock; + } + pr_devel("%s: start recovery for dev id %d.\n", __func__, header->dev_id); + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) + ublk_queue_reinit(ub, ublk_get_queue(ub, i)); + /* set to NULL, otherwise new ubq_daemon cannot mmap the io_cmd_buf */ + ub->mm = NULL; + ub->nr_queues_ready = 0; + init_completion(&ub->completion); + ret = 0; + out_unlock: + mutex_unlock(&ub->mutex); + ublk_put_device(ub); + return ret; +} + +static int ublk_ctrl_end_recovery(struct io_uring_cmd *cmd) +{ + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + int ublksrv_pid = (int)header->data[0]; + struct ublk_device *ub; + int ret = -EINVAL; + + ub = ublk_get_device_from_id(header->dev_id); + if (!ub) + return ret; + + pr_devel("%s: Waiting for new ubq_daemons(nr: %d) are ready, dev id %d...\n", + __func__, ub->dev_info.nr_hw_queues, header->dev_id); + /* wait until new ubq_daemon sending all FETCH_REQ */ + wait_for_completion_interruptible(&ub->completion); + pr_devel("%s: All new ubq_daemons(nr: %d) are ready, dev id %d\n", + __func__, ub->dev_info.nr_hw_queues, header->dev_id); + + mutex_lock(&ub->mutex); + if (!ublk_can_use_recovery(ub)) + goto out_unlock; + + if (ub->dev_info.state != UBLK_S_DEV_QUIESCED) { + ret = -EBUSY; + goto out_unlock; + } + ub->dev_info.ublksrv_pid = ublksrv_pid; + pr_devel("%s: new ublksrv_pid %d, dev id %d\n", + __func__, ublksrv_pid, header->dev_id); + blk_mq_unquiesce_queue(ub->ub_disk->queue); + pr_devel("%s: queue unquiesced, dev id %d.\n", + __func__, header->dev_id); + blk_mq_kick_requeue_list(ub->ub_disk->queue); + ub->dev_info.state = UBLK_S_DEV_LIVE; + schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD); + ret = 0; + out_unlock: + mutex_unlock(&ub->mutex); + ublk_put_device(ub); + return ret; +} + static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { @@ -1750,6 +2012,12 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, case UBLK_CMD_SET_PARAMS: ret = ublk_ctrl_set_params(cmd); break; + case UBLK_CMD_START_USER_RECOVERY: + ret = ublk_ctrl_start_recovery(cmd); + break; + case UBLK_CMD_END_USER_RECOVERY: + ret = ublk_ctrl_end_recovery(cmd); + break; default: break; } diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index dd9a05174726..3f4739d52268 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -801,7 +801,7 @@ static const struct attribute_group *virtblk_attr_groups[] = { NULL, }; -static int virtblk_map_queues(struct blk_mq_tag_set *set) +static void virtblk_map_queues(struct blk_mq_tag_set *set) { struct virtio_blk *vblk = set->driver_data; int i, qoff; @@ -826,8 +826,6 @@ static int virtblk_map_queues(struct blk_mq_tag_set *set) else blk_mq_virtio_map_queues(&set->map[i], vblk->vdev, 0); } - - return 0; } static void virtblk_complete_batch(struct io_comp_batch *iob) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 226ea76cc819..e551433cd107 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -499,7 +499,7 @@ static ssize_t backing_dev_store(struct device *dev, goto out; } - strlcpy(file_name, buf, PATH_MAX); + strscpy(file_name, buf, PATH_MAX); /* ignore trailing newline */ sz = strlen(file_name); if (sz > 0 && file_name[sz - 1] == '\n') @@ -1031,7 +1031,7 @@ static ssize_t comp_algorithm_store(struct device *dev, char compressor[ARRAY_SIZE(zram->compressor)]; size_t sz; - strlcpy(compressor, buf, sizeof(compressor)); + strscpy(compressor, buf, sizeof(compressor)); /* ignore trailing newline */ sz = strlen(compressor); if (sz > 0 && compressor[sz - 1] == '\n') @@ -1974,7 +1974,7 @@ static int zram_add(void) if (ret) goto out_cleanup_disk; - strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor)); + strscpy(zram->compressor, default_compressor, sizeof(zram->compressor)); zram_debugfs_register(zram); pr_info("Added device: %s\n", zram->disk->disk_name); diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 2acda9cea0f9..aebb7ef10e63 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -107,7 +107,7 @@ * * BTREE NODES: * - * Our unit of allocation is a bucket, and we we can't arbitrarily allocate and + * Our unit of allocation is a bucket, and we can't arbitrarily allocate and * free smaller than a bucket - so, that's how big our btree nodes are. * * (If buckets are really big we'll only use part of the bucket for a btree node diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 94d38e8a59b3..2bba4d6aaaa2 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -1264,7 +1264,7 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter, * * Don't worry event 'out' is allocated from mempool, it can * still be swapped here. Because state->pool is a page mempool - * creaated by by mempool_init_page_pool(), which allocates + * created by mempool_init_page_pool(), which allocates * pages by alloc_pages() indeed. */ diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h index ca4f435f7216..bd3afc856d53 100644 --- a/drivers/md/bcache/stats.h +++ b/drivers/md/bcache/stats.h @@ -54,7 +54,6 @@ void bch_cache_accounting_destroy(struct cache_accounting *acc); void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d, bool hit, bool bypass); -void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d); void bch_mark_cache_miss_collision(struct cache_set *c, struct bcache_device *d); void bch_mark_sectors_bypassed(struct cache_set *c, diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 3f0ff3aab6f2..0285b676e983 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -157,6 +157,53 @@ static void __update_writeback_rate(struct cached_dev *dc) dc->writeback_rate_target = target; } +static bool idle_counter_exceeded(struct cache_set *c) +{ + int counter, dev_nr; + + /* + * If c->idle_counter is overflow (idel for really long time), + * reset as 0 and not set maximum rate this time for code + * simplicity. + */ + counter = atomic_inc_return(&c->idle_counter); + if (counter <= 0) { + atomic_set(&c->idle_counter, 0); + return false; + } + + dev_nr = atomic_read(&c->attached_dev_nr); + if (dev_nr == 0) + return false; + + /* + * c->idle_counter is increased by writeback thread of all + * attached backing devices, in order to represent a rough + * time period, counter should be divided by dev_nr. + * Otherwise the idle time cannot be larger with more backing + * device attached. + * The following calculation equals to checking + * (counter / dev_nr) < (dev_nr * 6) + */ + if (counter < (dev_nr * dev_nr * 6)) + return false; + + return true; +} + +/* + * Idle_counter is increased every time when update_writeback_rate() is + * called. If all backing devices attached to the same cache set have + * identical dc->writeback_rate_update_seconds values, it is about 6 + * rounds of update_writeback_rate() on each backing device before + * c->at_max_writeback_rate is set to 1, and then max wrteback rate set + * to each dc->writeback_rate.rate. + * In order to avoid extra locking cost for counting exact dirty cached + * devices number, c->attached_dev_nr is used to calculate the idle + * throushold. It might be bigger if not all cached device are in write- + * back mode, but it still works well with limited extra rounds of + * update_writeback_rate(). + */ static bool set_at_max_writeback_rate(struct cache_set *c, struct cached_dev *dc) { @@ -167,21 +214,8 @@ static bool set_at_max_writeback_rate(struct cache_set *c, /* Don't set max writeback rate if gc is running */ if (!c->gc_mark_valid) return false; - /* - * Idle_counter is increased everytime when update_writeback_rate() is - * called. If all backing devices attached to the same cache set have - * identical dc->writeback_rate_update_seconds values, it is about 6 - * rounds of update_writeback_rate() on each backing device before - * c->at_max_writeback_rate is set to 1, and then max wrteback rate set - * to each dc->writeback_rate.rate. - * In order to avoid extra locking cost for counting exact dirty cached - * devices number, c->attached_dev_nr is used to calculate the idle - * throushold. It might be bigger if not all cached device are in write- - * back mode, but it still works well with limited extra rounds of - * update_writeback_rate(). - */ - if (atomic_inc_return(&c->idle_counter) < - atomic_read(&c->attached_dev_nr) * 6) + + if (!idle_counter_exceeded(c)) return false; if (atomic_read(&c->at_max_writeback_rate) != 1) @@ -195,13 +229,10 @@ static bool set_at_max_writeback_rate(struct cache_set *c, dc->writeback_rate_change = 0; /* - * Check c->idle_counter and c->at_max_writeback_rate agagain in case - * new I/O arrives during before set_at_max_writeback_rate() returns. - * Then the writeback rate is set to 1, and its new value should be - * decided via __update_writeback_rate(). + * In case new I/O arrives during before + * set_at_max_writeback_rate() returns. */ - if ((atomic_read(&c->idle_counter) < - atomic_read(&c->attached_dev_nr) * 6) || + if (!idle_counter_exceeded(c) || !atomic_read(&c->at_max_writeback_rate)) return false; @@ -801,10 +832,9 @@ static int bch_writeback_thread(void *arg) } } - if (dc->writeback_write_wq) { - flush_workqueue(dc->writeback_write_wq); + if (dc->writeback_write_wq) destroy_workqueue(dc->writeback_write_wq); - } + cached_dev_put(dc); wait_for_kthread_stop(); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 332f96b58252..d8034ff0cb24 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1856,9 +1856,7 @@ static bool dm_table_supports_write_zeroes(struct dm_table *t) static int device_not_nowait_capable(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { - struct request_queue *q = bdev_get_queue(dev->bdev); - - return !blk_queue_nowait(q); + return !bdev_nowait(dev->bdev); } static bool dm_table_supports_nowait(struct dm_table *t) diff --git a/drivers/md/md.c b/drivers/md/md.c index 729be2c5296c..a467b492d4ad 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5845,7 +5845,7 @@ int md_run(struct mddev *mddev) } } sysfs_notify_dirent_safe(rdev->sysfs_state); - nowait = nowait && blk_queue_nowait(bdev_get_queue(rdev->bdev)); + nowait = nowait && bdev_nowait(rdev->bdev); } if (!bioset_initialized(&mddev->bio_set)) { @@ -6982,7 +6982,7 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) * If the new disk does not support REQ_NOWAIT, * disable on the whole MD. */ - if (!blk_queue_nowait(bdev_get_queue(rdev->bdev))) { + if (!bdev_nowait(rdev->bdev)) { pr_info("%s: Disabling nowait because %pg does not support nowait\n", mdname(mddev), rdev->bdev); blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->queue); @@ -8156,7 +8156,6 @@ static void *md_seq_start(struct seq_file *seq, loff_t *pos) list_for_each(tmp,&all_mddevs) if (!l--) { mddev = list_entry(tmp, struct mddev, all_mddevs); - mddev_get(mddev); if (!mddev_get(mddev)) continue; spin_unlock(&all_mddevs_lock); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 78addfe4a0c9..857c49399c28 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -47,7 +47,7 @@ static void dump_zones(struct mddev *mddev) int len = 0; for (k = 0; k < conf->strip_zone[j].nb_dev; k++) - len += snprintf(line+len, 200-len, "%s%pg", k?"/":"", + len += scnprintf(line+len, 200-len, "%s%pg", k?"/":"", conf->devlist[j * raid_disks + k]->bdev); pr_debug("md: zone%d=[%s]\n", j, line); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 64d6e4cd8a3a..3aa8b6e11d58 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -79,6 +79,21 @@ static void end_reshape(struct r10conf *conf); #include "raid1-10.c" +#define NULL_CMD +#define cmd_before(conf, cmd) \ + do { \ + write_sequnlock_irq(&(conf)->resync_lock); \ + cmd; \ + } while (0) +#define cmd_after(conf) write_seqlock_irq(&(conf)->resync_lock) + +#define wait_event_barrier_cmd(conf, cond, cmd) \ + wait_event_cmd((conf)->wait_barrier, cond, cmd_before(conf, cmd), \ + cmd_after(conf)) + +#define wait_event_barrier(conf, cond) \ + wait_event_barrier_cmd(conf, cond, NULL_CMD) + /* * for resync bio, r10bio pointer can be retrieved from the per-bio * 'struct resync_pages'. @@ -274,6 +289,12 @@ static void put_buf(struct r10bio *r10_bio) lower_barrier(conf); } +static void wake_up_barrier(struct r10conf *conf) +{ + if (wq_has_sleeper(&conf->wait_barrier)) + wake_up(&conf->wait_barrier); +} + static void reschedule_retry(struct r10bio *r10_bio) { unsigned long flags; @@ -930,78 +951,101 @@ static void flush_pending_writes(struct r10conf *conf) static void raise_barrier(struct r10conf *conf, int force) { + write_seqlock_irq(&conf->resync_lock); BUG_ON(force && !conf->barrier); - spin_lock_irq(&conf->resync_lock); /* Wait until no block IO is waiting (unless 'force') */ - wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting, - conf->resync_lock); + wait_event_barrier(conf, force || !conf->nr_waiting); /* block any new IO from starting */ - conf->barrier++; + WRITE_ONCE(conf->barrier, conf->barrier + 1); /* Now wait for all pending IO to complete */ - wait_event_lock_irq(conf->wait_barrier, - !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH, - conf->resync_lock); + wait_event_barrier(conf, !atomic_read(&conf->nr_pending) && + conf->barrier < RESYNC_DEPTH); - spin_unlock_irq(&conf->resync_lock); + write_sequnlock_irq(&conf->resync_lock); } static void lower_barrier(struct r10conf *conf) { unsigned long flags; - spin_lock_irqsave(&conf->resync_lock, flags); - conf->barrier--; - spin_unlock_irqrestore(&conf->resync_lock, flags); + + write_seqlock_irqsave(&conf->resync_lock, flags); + WRITE_ONCE(conf->barrier, conf->barrier - 1); + write_sequnlock_irqrestore(&conf->resync_lock, flags); wake_up(&conf->wait_barrier); } +static bool stop_waiting_barrier(struct r10conf *conf) +{ + struct bio_list *bio_list = current->bio_list; + + /* barrier is dropped */ + if (!conf->barrier) + return true; + + /* + * If there are already pending requests (preventing the barrier from + * rising completely), and the pre-process bio queue isn't empty, then + * don't wait, as we need to empty that queue to get the nr_pending + * count down. + */ + if (atomic_read(&conf->nr_pending) && bio_list && + (!bio_list_empty(&bio_list[0]) || !bio_list_empty(&bio_list[1]))) + return true; + + /* move on if recovery thread is blocked by us */ + if (conf->mddev->thread->tsk == current && + test_bit(MD_RECOVERY_RUNNING, &conf->mddev->recovery) && + conf->nr_queued > 0) + return true; + + return false; +} + +static bool wait_barrier_nolock(struct r10conf *conf) +{ + unsigned int seq = read_seqbegin(&conf->resync_lock); + + if (READ_ONCE(conf->barrier)) + return false; + + atomic_inc(&conf->nr_pending); + if (!read_seqretry(&conf->resync_lock, seq)) + return true; + + if (atomic_dec_and_test(&conf->nr_pending)) + wake_up_barrier(conf); + + return false; +} + static bool wait_barrier(struct r10conf *conf, bool nowait) { bool ret = true; - spin_lock_irq(&conf->resync_lock); + if (wait_barrier_nolock(conf)) + return true; + + write_seqlock_irq(&conf->resync_lock); if (conf->barrier) { - struct bio_list *bio_list = current->bio_list; - conf->nr_waiting++; - /* Wait for the barrier to drop. - * However if there are already pending - * requests (preventing the barrier from - * rising completely), and the - * pre-process bio queue isn't empty, - * then don't wait, as we need to empty - * that queue to get the nr_pending - * count down. - */ /* Return false when nowait flag is set */ if (nowait) { ret = false; } else { + conf->nr_waiting++; raid10_log(conf->mddev, "wait barrier"); - wait_event_lock_irq(conf->wait_barrier, - !conf->barrier || - (atomic_read(&conf->nr_pending) && - bio_list && - (!bio_list_empty(&bio_list[0]) || - !bio_list_empty(&bio_list[1]))) || - /* move on if recovery thread is - * blocked by us - */ - (conf->mddev->thread->tsk == current && - test_bit(MD_RECOVERY_RUNNING, - &conf->mddev->recovery) && - conf->nr_queued > 0), - conf->resync_lock); + wait_event_barrier(conf, stop_waiting_barrier(conf)); + conf->nr_waiting--; } - conf->nr_waiting--; if (!conf->nr_waiting) wake_up(&conf->wait_barrier); } /* Only increment nr_pending when we wait */ if (ret) atomic_inc(&conf->nr_pending); - spin_unlock_irq(&conf->resync_lock); + write_sequnlock_irq(&conf->resync_lock); return ret; } @@ -1009,7 +1053,7 @@ static void allow_barrier(struct r10conf *conf) { if ((atomic_dec_and_test(&conf->nr_pending)) || (conf->array_freeze_pending)) - wake_up(&conf->wait_barrier); + wake_up_barrier(conf); } static void freeze_array(struct r10conf *conf, int extra) @@ -1026,27 +1070,24 @@ static void freeze_array(struct r10conf *conf, int extra) * must match the number of pending IOs (nr_pending) before * we continue. */ - spin_lock_irq(&conf->resync_lock); + write_seqlock_irq(&conf->resync_lock); conf->array_freeze_pending++; - conf->barrier++; + WRITE_ONCE(conf->barrier, conf->barrier + 1); conf->nr_waiting++; - wait_event_lock_irq_cmd(conf->wait_barrier, - atomic_read(&conf->nr_pending) == conf->nr_queued+extra, - conf->resync_lock, - flush_pending_writes(conf)); - + wait_event_barrier_cmd(conf, atomic_read(&conf->nr_pending) == + conf->nr_queued + extra, flush_pending_writes(conf)); conf->array_freeze_pending--; - spin_unlock_irq(&conf->resync_lock); + write_sequnlock_irq(&conf->resync_lock); } static void unfreeze_array(struct r10conf *conf) { /* reverse the effect of the freeze */ - spin_lock_irq(&conf->resync_lock); - conf->barrier--; + write_seqlock_irq(&conf->resync_lock); + WRITE_ONCE(conf->barrier, conf->barrier - 1); conf->nr_waiting--; wake_up(&conf->wait_barrier); - spin_unlock_irq(&conf->resync_lock); + write_sequnlock_irq(&conf->resync_lock); } static sector_t choose_data_offset(struct r10bio *r10_bio, @@ -1885,7 +1926,7 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio) __make_request(mddev, bio, sectors); /* In case raid10d snuck in to freeze_array */ - wake_up(&conf->wait_barrier); + wake_up_barrier(conf); return true; } @@ -1980,7 +2021,7 @@ static int enough(struct r10conf *conf, int ignore) * Otherwise, it must be degraded: * - recovery is interrupted. * - &mddev->degraded is bumped. - + * * @rdev is marked as &Faulty excluding case when array is failed and * &mddev->fail_last_dev is off. */ @@ -4032,7 +4073,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) INIT_LIST_HEAD(&conf->retry_list); INIT_LIST_HEAD(&conf->bio_end_io_list); - spin_lock_init(&conf->resync_lock); + seqlock_init(&conf->resync_lock); init_waitqueue_head(&conf->wait_barrier); atomic_set(&conf->nr_pending, 0); @@ -4351,7 +4392,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs) rdev->new_raid_disk = rdev->raid_disk * 2; rdev->sectors = size; } - conf->barrier = 1; + WRITE_ONCE(conf->barrier, 1); } return conf; diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 5c0804d8bb1f..8c072ce0bc54 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -76,7 +76,7 @@ struct r10conf { /* queue pending writes and submit them on unplug */ struct bio_list pending_bio_list; - spinlock_t resync_lock; + seqlock_t resync_lock; atomic_t nr_pending; int nr_waiting; int nr_queued; diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index f4e1cc1ece43..79c73330020b 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -125,7 +125,7 @@ struct r5l_log { * reclaimed. if it's 0, reclaim spaces * used by io_units which are in * IO_UNIT_STRIPE_END state (eg, reclaim - * dones't wait for specific io_unit + * doesn't wait for specific io_unit * switching to IO_UNIT_STRIPE_END * state) */ wait_queue_head_t iounit_wait; @@ -1327,9 +1327,9 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log, * superblock is updated to new log tail. Updating superblock (either * directly call md_update_sb() or depend on md thread) must hold * reconfig mutex. On the other hand, raid5_quiesce is called with - * reconfig_mutex hold. The first step of raid5_quiesce() is waitting - * for all IO finish, hence waitting for reclaim thread, while reclaim - * thread is calling this function and waitting for reconfig mutex. So + * reconfig_mutex hold. The first step of raid5_quiesce() is waiting + * for all IO finish, hence waiting for reclaim thread, while reclaim + * thread is calling this function and waiting for reconfig mutex. So * there is a deadlock. We workaround this issue with a trylock. * FIXME: we could miss discard if we can't take reconfig mutex */ @@ -1923,7 +1923,8 @@ r5c_recovery_alloc_stripe( { struct stripe_head *sh; - sh = raid5_get_active_stripe(conf, stripe_sect, 0, noblock, 0); + sh = raid5_get_active_stripe(conf, NULL, stripe_sect, + noblock ? R5_GAS_NOBLOCK : 0); if (!sh) return NULL; /* no more stripe available */ diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 31a0cbf63384..7b820b81d8c2 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -36,6 +36,7 @@ */ #include <linux/blkdev.h> +#include <linux/delay.h> #include <linux/kthread.h> #include <linux/raid/pq.h> #include <linux/async_tx.h> @@ -789,87 +790,80 @@ struct stripe_request_ctx { */ static bool is_inactive_blocked(struct r5conf *conf, int hash) { - int active = atomic_read(&conf->active_stripes); - if (list_empty(conf->inactive_list + hash)) return false; if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) return true; - return active < (conf->max_nr_stripes * 3 / 4); + return (atomic_read(&conf->active_stripes) < + (conf->max_nr_stripes * 3 / 4)); } -static struct stripe_head *__raid5_get_active_stripe(struct r5conf *conf, +struct stripe_head *raid5_get_active_stripe(struct r5conf *conf, struct stripe_request_ctx *ctx, sector_t sector, - bool previous, bool noblock, bool noquiesce) + unsigned int flags) { struct stripe_head *sh; int hash = stripe_hash_locks_hash(conf, sector); + int previous = !!(flags & R5_GAS_PREVIOUS); pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); spin_lock_irq(conf->hash_locks + hash); -retry: - if (!noquiesce && conf->quiesce) { - /* - * Must release the reference to batch_last before waiting, - * on quiesce, otherwise the batch_last will hold a reference - * to a stripe and raid5_quiesce() will deadlock waiting for - * active_stripes to go to zero. - */ - if (ctx && ctx->batch_last) { - raid5_release_stripe(ctx->batch_last); - ctx->batch_last = NULL; - } - - wait_event_lock_irq(conf->wait_for_quiescent, !conf->quiesce, - *(conf->hash_locks + hash)); - } + for (;;) { + if (!(flags & R5_GAS_NOQUIESCE) && conf->quiesce) { + /* + * Must release the reference to batch_last before + * waiting, on quiesce, otherwise the batch_last will + * hold a reference to a stripe and raid5_quiesce() + * will deadlock waiting for active_stripes to go to + * zero. + */ + if (ctx && ctx->batch_last) { + raid5_release_stripe(ctx->batch_last); + ctx->batch_last = NULL; + } - sh = find_get_stripe(conf, sector, conf->generation - previous, hash); - if (sh) - goto out; + wait_event_lock_irq(conf->wait_for_quiescent, + !conf->quiesce, + *(conf->hash_locks + hash)); + } - if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) - goto wait_for_stripe; + sh = find_get_stripe(conf, sector, conf->generation - previous, + hash); + if (sh) + break; - sh = get_free_stripe(conf, hash); - if (sh) { - r5c_check_stripe_cache_usage(conf); - init_stripe(sh, sector, previous); - atomic_inc(&sh->count); - goto out; - } + if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) { + sh = get_free_stripe(conf, hash); + if (sh) { + r5c_check_stripe_cache_usage(conf); + init_stripe(sh, sector, previous); + atomic_inc(&sh->count); + break; + } - if (!test_bit(R5_DID_ALLOC, &conf->cache_state)) - set_bit(R5_ALLOC_MORE, &conf->cache_state); + if (!test_bit(R5_DID_ALLOC, &conf->cache_state)) + set_bit(R5_ALLOC_MORE, &conf->cache_state); + } -wait_for_stripe: - if (noblock) - goto out; + if (flags & R5_GAS_NOBLOCK) + break; - set_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); - r5l_wake_reclaim(conf->log, 0); - wait_event_lock_irq(conf->wait_for_stripe, - is_inactive_blocked(conf, hash), - *(conf->hash_locks + hash)); - clear_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); - goto retry; + set_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); + r5l_wake_reclaim(conf->log, 0); + wait_event_lock_irq(conf->wait_for_stripe, + is_inactive_blocked(conf, hash), + *(conf->hash_locks + hash)); + clear_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); + } -out: spin_unlock_irq(conf->hash_locks + hash); return sh; } -struct stripe_head *raid5_get_active_stripe(struct r5conf *conf, - sector_t sector, bool previous, bool noblock, bool noquiesce) -{ - return __raid5_get_active_stripe(conf, NULL, sector, previous, noblock, - noquiesce); -} - static bool is_full_stripe_write(struct stripe_head *sh) { BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded)); @@ -4047,7 +4041,7 @@ static void handle_stripe_fill(struct stripe_head *sh, * back cache (prexor with orig_page, and then xor with * page) in the read path */ - if (s->injournal && s->failed) { + if (s->to_read && s->injournal && s->failed) { if (test_bit(STRIPE_R5C_CACHING, &sh->state)) r5c_make_stripe_write_out(sh); goto out; @@ -4636,7 +4630,8 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) sector_t bn = raid5_compute_blocknr(sh, i, 1); sector_t s = raid5_compute_sector(conf, bn, 0, &dd_idx, NULL); - sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1); + sh2 = raid5_get_active_stripe(conf, NULL, s, + R5_GAS_NOBLOCK | R5_GAS_NOQUIESCE); if (sh2 == NULL) /* so far only the early blocks of this stripe * have been requested. When later blocks @@ -5273,7 +5268,9 @@ static void handle_stripe(struct stripe_head *sh) /* Finish reconstruct operations initiated by the expansion process */ if (sh->reconstruct_state == reconstruct_state_result) { struct stripe_head *sh_src - = raid5_get_active_stripe(conf, sh->sector, 1, 1, 1); + = raid5_get_active_stripe(conf, NULL, sh->sector, + R5_GAS_PREVIOUS | R5_GAS_NOBLOCK | + R5_GAS_NOQUIESCE); if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { /* sh cannot be written until sh_src has been read. * so arrange for sh to be delayed a little @@ -5542,7 +5539,6 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) if (is_badblock(rdev, sector, bio_sectors(raid_bio), &first_bad, &bad_sectors)) { - bio_put(raid_bio); rdev_dec_pending(rdev, mddev); return 0; } @@ -5823,7 +5819,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) DEFINE_WAIT(w); int d; again: - sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0); + sh = raid5_get_active_stripe(conf, NULL, logical_sector, 0); prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); @@ -5978,7 +5974,7 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, enum stripe_result ret; struct stripe_head *sh; sector_t new_sector; - int previous = 0; + int previous = 0, flags = 0; int seq, dd_idx; seq = read_seqcount_begin(&conf->gen_lock); @@ -6012,8 +6008,11 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, pr_debug("raid456: %s, sector %llu logical %llu\n", __func__, new_sector, logical_sector); - sh = __raid5_get_active_stripe(conf, ctx, new_sector, previous, - (bi->bi_opf & REQ_RAHEAD), 0); + if (previous) + flags |= R5_GAS_PREVIOUS; + if (bi->bi_opf & REQ_RAHEAD) + flags |= R5_GAS_NOBLOCK; + sh = raid5_get_active_stripe(conf, ctx, new_sector, flags); if (unlikely(!sh)) { /* cannot get stripe, just give-up */ bi->bi_status = BLK_STS_IOERR; @@ -6362,7 +6361,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk for (i = 0; i < reshape_sectors; i += RAID5_STRIPE_SECTORS(conf)) { int j; int skipped_disk = 0; - sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1); + sh = raid5_get_active_stripe(conf, NULL, stripe_addr+i, + R5_GAS_NOQUIESCE); set_bit(STRIPE_EXPANDING, &sh->state); atomic_inc(&conf->reshape_stripes); /* If any of this stripe is beyond the end of the old @@ -6411,7 +6411,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk if (last_sector >= mddev->dev_sectors) last_sector = mddev->dev_sectors - 1; while (first_sector <= last_sector) { - sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1); + sh = raid5_get_active_stripe(conf, NULL, first_sector, + R5_GAS_PREVIOUS | R5_GAS_NOQUIESCE); set_bit(STRIPE_EXPAND_SOURCE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); raid5_release_stripe(sh); @@ -6531,9 +6532,10 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false); - sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0); + sh = raid5_get_active_stripe(conf, NULL, sector_nr, + R5_GAS_NOBLOCK); if (sh == NULL) { - sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0); + sh = raid5_get_active_stripe(conf, NULL, sector_nr, 0); /* make sure we don't swamp the stripe cache if someone else * is trying to get access */ @@ -6596,8 +6598,8 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio, /* already done this stripe */ continue; - sh = raid5_get_active_stripe(conf, sector, 0, 1, 1); - + sh = raid5_get_active_stripe(conf, NULL, sector, + R5_GAS_NOBLOCK | R5_GAS_NOQUIESCE); if (!sh) { /* failed to get a stripe - must wait */ conf->retry_read_aligned = raid_bio; @@ -6781,7 +6783,18 @@ static void raid5d(struct md_thread *thread) spin_unlock_irq(&conf->device_lock); md_check_recovery(mddev); spin_lock_irq(&conf->device_lock); + + /* + * Waiting on MD_SB_CHANGE_PENDING below may deadlock + * seeing md_check_recovery() is needed to clear + * the flag when using mdmon. + */ + continue; } + + wait_event_lock_irq(mddev->sb_wait, + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags), + conf->device_lock); } pr_debug("%d stripes handled\n", handled); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index a5082bed83c8..e873938a6125 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -803,16 +803,24 @@ raid5_get_dev_page(struct stripe_head *sh, int disk_idx) } #endif -extern void md_raid5_kick_device(struct r5conf *conf); -extern int raid5_set_cache_size(struct mddev *mddev, int size); -extern sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous); -extern void raid5_release_stripe(struct stripe_head *sh); -extern sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, - int previous, int *dd_idx, - struct stripe_head *sh); -extern struct stripe_head * -raid5_get_active_stripe(struct r5conf *conf, sector_t sector, - bool previous, bool noblock, bool noquiesce); -extern int raid5_calc_degraded(struct r5conf *conf); -extern int r5c_journal_mode_set(struct mddev *mddev, int journal_mode); +void md_raid5_kick_device(struct r5conf *conf); +int raid5_set_cache_size(struct mddev *mddev, int size); +sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous); +void raid5_release_stripe(struct stripe_head *sh); +sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, + int previous, int *dd_idx, struct stripe_head *sh); + +struct stripe_request_ctx; +/* get stripe from previous generation (when reshaping) */ +#define R5_GAS_PREVIOUS (1 << 0) +/* do not block waiting for a free stripe */ +#define R5_GAS_NOBLOCK (1 << 1) +/* do not block waiting for quiesce to be released */ +#define R5_GAS_NOQUIESCE (1 << 2) +struct stripe_head *raid5_get_active_stripe(struct r5conf *conf, + struct stripe_request_ctx *ctx, sector_t sector, + unsigned int flags); + +int raid5_calc_degraded(struct r5conf *conf); +int r5c_journal_mode_set(struct mddev *mddev, int journal_mode); #endif diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 45ef8e8ddc84..64f599a64a7f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1111,8 +1111,8 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return effects; } -static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects, - struct nvme_command *cmd, int status) +void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects, + struct nvme_command *cmd, int status) { if (effects & NVME_CMD_EFFECTS_CSE_MASK) { nvme_unfreeze(ctrl); @@ -1148,21 +1148,16 @@ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects, break; } } +EXPORT_SYMBOL_NS_GPL(nvme_passthru_end, NVME_TARGET_PASSTHRU); -int nvme_execute_passthru_rq(struct request *rq) +int nvme_execute_passthru_rq(struct request *rq, u32 *effects) { struct nvme_command *cmd = nvme_req(rq)->cmd; struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl; struct nvme_ns *ns = rq->q->queuedata; - u32 effects; - int ret; - effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode); - ret = nvme_execute_rq(rq, false); - if (effects) /* nothing to be done for zero cmd effects */ - nvme_passthru_end(ctrl, effects, cmd, ret); - - return ret; + *effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode); + return nvme_execute_rq(rq, false); } EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU); @@ -2696,7 +2691,7 @@ static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ct if(!(ctrl->quirks & NVME_QUIRK_IGNORE_DEV_SUBNQN)) { nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE); if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) { - strlcpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE); + strscpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE); return; } @@ -2704,7 +2699,11 @@ static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ct dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n"); } - /* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */ + /* + * Generate a "fake" NQN similar to the one in Section 4.5 of the NVMe + * Base Specification 2.0. It is slightly different from the format + * specified there due to historic reasons, and we can't change it now. + */ off = snprintf(subsys->subnqn, NVMF_NQN_SIZE, "nqn.2014.08.org.nvmexpress:%04x%04x", le16_to_cpu(id->vid), le16_to_cpu(id->ssvid)); @@ -2894,7 +2893,6 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) nvme_init_subnqn(subsys, ctrl, id); memcpy(subsys->serial, id->sn, sizeof(subsys->serial)); memcpy(subsys->model, id->mn, sizeof(subsys->model)); - memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev)); subsys->vendor_id = le16_to_cpu(id->vid); subsys->cmic = id->cmic; @@ -3113,6 +3111,8 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->quirks |= core_quirks[i].quirks; } } + memcpy(ctrl->subsys->firmware_rev, id->fr, + sizeof(ctrl->subsys->firmware_rev)); if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) { dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n"); @@ -4805,6 +4805,108 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, } EXPORT_SYMBOL_GPL(nvme_complete_async_event); +int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, + const struct blk_mq_ops *ops, unsigned int flags, + unsigned int cmd_size) +{ + int ret; + + memset(set, 0, sizeof(*set)); + set->ops = ops; + set->queue_depth = NVME_AQ_MQ_TAG_DEPTH; + if (ctrl->ops->flags & NVME_F_FABRICS) + set->reserved_tags = NVMF_RESERVED_TAGS; + set->numa_node = ctrl->numa_node; + set->flags = flags; + set->cmd_size = cmd_size; + set->driver_data = ctrl; + set->nr_hw_queues = 1; + set->timeout = NVME_ADMIN_TIMEOUT; + ret = blk_mq_alloc_tag_set(set); + if (ret) + return ret; + + ctrl->admin_q = blk_mq_init_queue(set); + if (IS_ERR(ctrl->admin_q)) { + ret = PTR_ERR(ctrl->admin_q); + goto out_free_tagset; + } + + if (ctrl->ops->flags & NVME_F_FABRICS) { + ctrl->fabrics_q = blk_mq_init_queue(set); + if (IS_ERR(ctrl->fabrics_q)) { + ret = PTR_ERR(ctrl->fabrics_q); + goto out_cleanup_admin_q; + } + } + + ctrl->admin_tagset = set; + return 0; + +out_cleanup_admin_q: + blk_mq_destroy_queue(ctrl->fabrics_q); +out_free_tagset: + blk_mq_free_tag_set(ctrl->admin_tagset); + return ret; +} +EXPORT_SYMBOL_GPL(nvme_alloc_admin_tag_set); + +void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl) +{ + blk_mq_destroy_queue(ctrl->admin_q); + if (ctrl->ops->flags & NVME_F_FABRICS) + blk_mq_destroy_queue(ctrl->fabrics_q); + blk_mq_free_tag_set(ctrl->admin_tagset); +} +EXPORT_SYMBOL_GPL(nvme_remove_admin_tag_set); + +int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, + const struct blk_mq_ops *ops, unsigned int flags, + unsigned int cmd_size) +{ + int ret; + + memset(set, 0, sizeof(*set)); + set->ops = ops; + set->queue_depth = ctrl->sqsize + 1; + set->reserved_tags = NVMF_RESERVED_TAGS; + set->numa_node = ctrl->numa_node; + set->flags = flags; + set->cmd_size = cmd_size, + set->driver_data = ctrl; + set->nr_hw_queues = ctrl->queue_count - 1; + set->timeout = NVME_IO_TIMEOUT; + if (ops->map_queues) + set->nr_maps = ctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2; + ret = blk_mq_alloc_tag_set(set); + if (ret) + return ret; + + if (ctrl->ops->flags & NVME_F_FABRICS) { + ctrl->connect_q = blk_mq_init_queue(set); + if (IS_ERR(ctrl->connect_q)) { + ret = PTR_ERR(ctrl->connect_q); + goto out_free_tag_set; + } + } + + ctrl->tagset = set; + return 0; + +out_free_tag_set: + blk_mq_free_tag_set(set); + return ret; +} +EXPORT_SYMBOL_GPL(nvme_alloc_io_tag_set); + +void nvme_remove_io_tag_set(struct nvme_ctrl *ctrl) +{ + if (ctrl->ops->flags & NVME_F_FABRICS) + blk_mq_destroy_queue(ctrl->connect_q); + blk_mq_free_tag_set(ctrl->tagset); +} +EXPORT_SYMBOL_GPL(nvme_remove_io_tag_set); + void nvme_stop_ctrl(struct nvme_ctrl *ctrl) { nvme_mpath_stop(ctrl); @@ -4824,6 +4926,16 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl) nvme_enable_aen(ctrl); + /* + * persistent discovery controllers need to send indication to userspace + * to re-read the discovery log page to learn about possible changes + * that were missed. We identify persistent discovery controllers by + * checking that they started once before, hence are reconnecting back. + */ + if (test_and_set_bit(NVME_CTRL_STARTED_ONCE, &ctrl->flags) && + nvme_discovery_ctrl(ctrl)) + nvme_change_uevent(ctrl, "NVME_EVENT=rediscover"); + if (ctrl->queue_count > 1) { nvme_queue_scan(ctrl); nvme_start_queues(ctrl); diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 10cc4a814602..ce27276f552d 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -49,7 +49,7 @@ static struct nvmf_host *nvmf_host_add(const char *hostnqn) goto out_unlock; kref_init(&host->ref); - strlcpy(host->nqn, hostnqn, NVMF_NQN_SIZE); + strscpy(host->nqn, hostnqn, NVMF_NQN_SIZE); list_add_tail(&host->list, &nvmf_hosts); out_unlock: @@ -971,13 +971,17 @@ bool nvmf_ip_options_match(struct nvme_ctrl *ctrl, return false; /* - * Checking the local address is rough. In most cases, none is specified - * and the host port is selected by the stack. + * Checking the local address or host interfaces is rough. + * + * In most cases, none is specified and the host port or + * host interface is selected by the stack. * * Assume no match if: - * - local address is specified and address is not the same - * - local address is not specified but remote is, or vice versa - * (admin using specific host_traddr when it matters). + * - local address or host interface is specified and address + * or host interface is not the same + * - local address or host interface is not specified but + * remote is, or vice versa (admin using specific + * host_traddr/host_iface when it matters). */ if ((opts->mask & NVMF_OPT_HOST_TRADDR) && (ctrl->opts->mask & NVMF_OPT_HOST_TRADDR)) { @@ -988,6 +992,15 @@ bool nvmf_ip_options_match(struct nvme_ctrl *ctrl, return false; } + if ((opts->mask & NVMF_OPT_HOST_IFACE) && + (ctrl->opts->mask & NVMF_OPT_HOST_IFACE)) { + if (strcmp(opts->host_iface, ctrl->opts->host_iface)) + return false; + } else if ((opts->mask & NVMF_OPT_HOST_IFACE) || + (ctrl->opts->mask & NVMF_OPT_HOST_IFACE)) { + return false; + } + return true; } EXPORT_SYMBOL_GPL(nvmf_ip_options_match); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 127abaf9ba5d..5d57a042dbca 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1829,7 +1829,7 @@ nvme_fc_exit_request(struct blk_mq_tag_set *set, struct request *rq, { struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq); - return __nvme_fc_exit_request(set->driver_data, op); + return __nvme_fc_exit_request(to_fc_ctrl(set->driver_data), op); } static int @@ -2135,7 +2135,7 @@ static int nvme_fc_init_request(struct blk_mq_tag_set *set, struct request *rq, unsigned int hctx_idx, unsigned int numa_node) { - struct nvme_fc_ctrl *ctrl = set->driver_data; + struct nvme_fc_ctrl *ctrl = to_fc_ctrl(set->driver_data); struct nvme_fcp_op_w_sgl *op = blk_mq_rq_to_pdu(rq); int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0; struct nvme_fc_queue *queue = &ctrl->queues[queue_idx]; @@ -2206,36 +2206,28 @@ nvme_fc_term_aen_ops(struct nvme_fc_ctrl *ctrl) } } -static inline void -__nvme_fc_init_hctx(struct blk_mq_hw_ctx *hctx, struct nvme_fc_ctrl *ctrl, - unsigned int qidx) +static inline int +__nvme_fc_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int qidx) { + struct nvme_fc_ctrl *ctrl = to_fc_ctrl(data); struct nvme_fc_queue *queue = &ctrl->queues[qidx]; hctx->driver_data = queue; queue->hctx = hctx; + return 0; } static int -nvme_fc_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, - unsigned int hctx_idx) +nvme_fc_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { - struct nvme_fc_ctrl *ctrl = data; - - __nvme_fc_init_hctx(hctx, ctrl, hctx_idx + 1); - - return 0; + return __nvme_fc_init_hctx(hctx, data, hctx_idx + 1); } static int nvme_fc_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { - struct nvme_fc_ctrl *ctrl = data; - - __nvme_fc_init_hctx(hctx, ctrl, hctx_idx); - - return 0; + return __nvme_fc_init_hctx(hctx, data, hctx_idx); } static void @@ -2391,10 +2383,8 @@ nvme_fc_ctrl_free(struct kref *ref) container_of(ref, struct nvme_fc_ctrl, ref); unsigned long flags; - if (ctrl->ctrl.tagset) { - blk_mq_destroy_queue(ctrl->ctrl.connect_q); - blk_mq_free_tag_set(&ctrl->tag_set); - } + if (ctrl->ctrl.tagset) + nvme_remove_io_tag_set(&ctrl->ctrl); /* remove from rport list */ spin_lock_irqsave(&ctrl->rport->lock, flags); @@ -2402,9 +2392,7 @@ nvme_fc_ctrl_free(struct kref *ref) spin_unlock_irqrestore(&ctrl->rport->lock, flags); nvme_start_admin_queue(&ctrl->ctrl); - blk_mq_destroy_queue(ctrl->ctrl.admin_q); - blk_mq_destroy_queue(ctrl->ctrl.fabrics_q); - blk_mq_free_tag_set(&ctrl->admin_tag_set); + nvme_remove_admin_tag_set(&ctrl->ctrl); kfree(ctrl->queues); @@ -2860,9 +2848,9 @@ nvme_fc_complete_rq(struct request *rq) nvme_fc_ctrl_put(ctrl); } -static int nvme_fc_map_queues(struct blk_mq_tag_set *set) +static void nvme_fc_map_queues(struct blk_mq_tag_set *set) { - struct nvme_fc_ctrl *ctrl = set->driver_data; + struct nvme_fc_ctrl *ctrl = to_fc_ctrl(set->driver_data); int i; for (i = 0; i < set->nr_maps; i++) { @@ -2880,7 +2868,6 @@ static int nvme_fc_map_queues(struct blk_mq_tag_set *set) else blk_mq_map_queues(map); } - return 0; } static const struct blk_mq_ops nvme_fc_mq_ops = { @@ -2915,32 +2902,16 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl) nvme_fc_init_io_queues(ctrl); - memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set)); - ctrl->tag_set.ops = &nvme_fc_mq_ops; - ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size; - ctrl->tag_set.reserved_tags = NVMF_RESERVED_TAGS; - ctrl->tag_set.numa_node = ctrl->ctrl.numa_node; - ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; - ctrl->tag_set.cmd_size = - struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv, - ctrl->lport->ops->fcprqst_priv_sz); - ctrl->tag_set.driver_data = ctrl; - ctrl->tag_set.nr_hw_queues = ctrl->ctrl.queue_count - 1; - ctrl->tag_set.timeout = NVME_IO_TIMEOUT; - - ret = blk_mq_alloc_tag_set(&ctrl->tag_set); + ret = nvme_alloc_io_tag_set(&ctrl->ctrl, &ctrl->tag_set, + &nvme_fc_mq_ops, BLK_MQ_F_SHOULD_MERGE, + struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv, + ctrl->lport->ops->fcprqst_priv_sz)); if (ret) return ret; - ctrl->ctrl.tagset = &ctrl->tag_set; - - ret = nvme_ctrl_init_connect_q(&(ctrl->ctrl)); - if (ret) - goto out_free_tag_set; - ret = nvme_fc_create_hw_io_queues(ctrl, ctrl->ctrl.sqsize + 1); if (ret) - goto out_cleanup_blk_queue; + goto out_cleanup_tagset; ret = nvme_fc_connect_io_queues(ctrl, ctrl->ctrl.sqsize + 1); if (ret) @@ -2952,10 +2923,8 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl) out_delete_hw_queues: nvme_fc_delete_hw_io_queues(ctrl); -out_cleanup_blk_queue: - blk_mq_destroy_queue(ctrl->ctrl.connect_q); -out_free_tag_set: - blk_mq_free_tag_set(&ctrl->tag_set); +out_cleanup_tagset: + nvme_remove_io_tag_set(&ctrl->ctrl); nvme_fc_free_io_queues(ctrl); /* force put free routine to ignore io queues */ @@ -3166,15 +3135,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) "to maxcmd\n", opts->queue_size, ctrl->ctrl.maxcmd); opts->queue_size = ctrl->ctrl.maxcmd; - } - - if (opts->queue_size > ctrl->ctrl.sqsize + 1) { - /* warn if sqsize is lower than queue_size */ - dev_warn(ctrl->ctrl.device, - "queue_size %zu > ctrl sqsize %u, reducing " - "to sqsize\n", - opts->queue_size, ctrl->ctrl.sqsize + 1); - opts->queue_size = ctrl->ctrl.sqsize + 1; + ctrl->ctrl.sqsize = opts->queue_size - 1; } ret = nvme_fc_init_aen_ops(ctrl); @@ -3547,35 +3508,12 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, nvme_fc_init_queue(ctrl, 0); - memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); - ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops; - ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH; - ctrl->admin_tag_set.reserved_tags = NVMF_RESERVED_TAGS; - ctrl->admin_tag_set.numa_node = ctrl->ctrl.numa_node; - ctrl->admin_tag_set.cmd_size = - struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv, - ctrl->lport->ops->fcprqst_priv_sz); - ctrl->admin_tag_set.driver_data = ctrl; - ctrl->admin_tag_set.nr_hw_queues = 1; - ctrl->admin_tag_set.timeout = NVME_ADMIN_TIMEOUT; - ctrl->admin_tag_set.flags = BLK_MQ_F_NO_SCHED; - - ret = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); + ret = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set, + &nvme_fc_admin_mq_ops, BLK_MQ_F_NO_SCHED, + struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv, + ctrl->lport->ops->fcprqst_priv_sz)); if (ret) goto out_free_queues; - ctrl->ctrl.admin_tagset = &ctrl->admin_tag_set; - - ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set); - if (IS_ERR(ctrl->ctrl.fabrics_q)) { - ret = PTR_ERR(ctrl->ctrl.fabrics_q); - goto out_free_admin_tag_set; - } - - ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); - if (IS_ERR(ctrl->ctrl.admin_q)) { - ret = PTR_ERR(ctrl->ctrl.admin_q); - goto out_cleanup_fabrics_q; - } /* * Would have been nice to init io queues tag set as well. @@ -3586,7 +3524,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_fc_ctrl_ops, 0); if (ret) - goto out_cleanup_admin_q; + goto out_cleanup_tagset; /* at this point, teardown path changes to ref counting on nvme ctrl */ @@ -3641,12 +3579,8 @@ fail_ctrl: return ERR_PTR(-EIO); -out_cleanup_admin_q: - blk_mq_destroy_queue(ctrl->ctrl.admin_q); -out_cleanup_fabrics_q: - blk_mq_destroy_queue(ctrl->ctrl.fabrics_q); -out_free_admin_tag_set: - blk_mq_free_tag_set(&ctrl->admin_tag_set); +out_cleanup_tagset: + nvme_remove_admin_tag_set(&ctrl->ctrl); out_free_queues: kfree(ctrl->queues); out_free_ida: diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 548aca8b5b9f..357791ff0623 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -136,9 +136,11 @@ static int nvme_submit_user_cmd(struct request_queue *q, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, u32 meta_seed, u64 *result, unsigned timeout, bool vec) { + struct nvme_ctrl *ctrl; struct request *req; void *meta = NULL; struct bio *bio; + u32 effects; int ret; req = nvme_alloc_user_request(q, cmd, ubuffer, bufflen, meta_buffer, @@ -147,8 +149,9 @@ static int nvme_submit_user_cmd(struct request_queue *q, return PTR_ERR(req); bio = req->bio; + ctrl = nvme_req(req)->ctrl; - ret = nvme_execute_passthru_rq(req); + ret = nvme_execute_passthru_rq(req, &effects); if (result) *result = le64_to_cpu(nvme_req(req)->result.u64); @@ -158,6 +161,10 @@ static int nvme_submit_user_cmd(struct request_queue *q, if (bio) blk_rq_unmap_user(bio); blk_mq_free_request(req); + + if (effects) + nvme_passthru_end(ctrl, effects, cmd, ret); + return ret; } @@ -824,11 +831,17 @@ long nvme_dev_ioctl(struct file *file, unsigned int cmd, case NVME_IOCTL_IO_CMD: return nvme_dev_user_cmd(ctrl, argp); case NVME_IOCTL_RESET: + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; dev_warn(ctrl->device, "resetting controller\n"); return nvme_reset_ctrl_sync(ctrl); case NVME_IOCTL_SUBSYS_RESET: + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; return nvme_reset_subsystem(ctrl); case NVME_IOCTL_RESCAN: + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; nvme_queue_scan(ctrl); return 0; default: diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 216acbe953b3..a29877217ee6 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -233,6 +233,12 @@ struct nvme_fault_inject { #endif }; +enum nvme_ctrl_flags { + NVME_CTRL_FAILFAST_EXPIRED = 0, + NVME_CTRL_ADMIN_Q_STOPPED = 1, + NVME_CTRL_STARTED_ONCE = 2, +}; + struct nvme_ctrl { bool comp_seen; enum nvme_ctrl_state state; @@ -354,8 +360,6 @@ struct nvme_ctrl { u16 maxcmd; int nr_reconnects; unsigned long flags; -#define NVME_CTRL_FAILFAST_EXPIRED 0 -#define NVME_CTRL_ADMIN_Q_STOPPED 1 struct nvmf_ctrl_options *opts; struct page *discard_page; @@ -602,11 +606,23 @@ static inline void nvme_fault_inject_fini(struct nvme_fault_inject *fault_inj) static inline void nvme_should_fail(struct request *req) {} #endif +bool nvme_wait_reset(struct nvme_ctrl *ctrl); +int nvme_try_sched_reset(struct nvme_ctrl *ctrl); + static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl) { + int ret; + if (!ctrl->subsystem) return -ENOTTY; - return ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, 0x4E564D65); + if (!nvme_wait_reset(ctrl)) + return -EBUSY; + + ret = ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, 0x4E564D65); + if (ret) + return ret; + + return nvme_try_sched_reset(ctrl); } /* @@ -712,7 +728,6 @@ void nvme_cancel_tagset(struct nvme_ctrl *ctrl); void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl); bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, enum nvme_ctrl_state new_state); -bool nvme_wait_reset(struct nvme_ctrl *ctrl); int nvme_disable_ctrl(struct nvme_ctrl *ctrl); int nvme_enable_ctrl(struct nvme_ctrl *ctrl); int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl); @@ -722,6 +737,14 @@ void nvme_uninit_ctrl(struct nvme_ctrl *ctrl); void nvme_start_ctrl(struct nvme_ctrl *ctrl); void nvme_stop_ctrl(struct nvme_ctrl *ctrl); int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl); +int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, + const struct blk_mq_ops *ops, unsigned int flags, + unsigned int cmd_size); +void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl); +int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, + const struct blk_mq_ops *ops, unsigned int flags, + unsigned int cmd_size); +void nvme_remove_io_tag_set(struct nvme_ctrl *ctrl); void nvme_remove_namespaces(struct nvme_ctrl *ctrl); @@ -802,7 +825,6 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); int nvme_reset_ctrl(struct nvme_ctrl *ctrl); int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl); -int nvme_try_sched_reset(struct nvme_ctrl *ctrl); int nvme_delete_ctrl(struct nvme_ctrl *ctrl); void nvme_queue_scan(struct nvme_ctrl *ctrl); int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi, @@ -972,14 +994,6 @@ static inline int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) } #endif -static inline int nvme_ctrl_init_connect_q(struct nvme_ctrl *ctrl) -{ - ctrl->connect_q = blk_mq_init_queue(ctrl->tagset); - if (IS_ERR(ctrl->connect_q)) - return PTR_ERR(ctrl->connect_q); - return 0; -} - static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev) { return dev_to_disk(dev)->private_data; @@ -1027,7 +1041,9 @@ static inline void nvme_auth_free(struct nvme_ctrl *ctrl) {}; u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode); -int nvme_execute_passthru_rq(struct request *rq); +int nvme_execute_passthru_rq(struct request *rq, u32 *effects); +void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects, + struct nvme_command *cmd, int status); struct nvme_ctrl *nvme_ctrl_from_file(struct file *file); struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid); void nvme_put_ns(struct nvme_ns *ns); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 67d3335e9cc8..9aafc1ed6439 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -226,12 +226,12 @@ struct nvme_queue { struct nvme_iod { struct nvme_request req; struct nvme_command cmd; - struct nvme_queue *nvmeq; bool use_sgl; - int aborted; - int npages; /* In the PRP list. 0 means small pool in use */ - dma_addr_t first_dma; + bool aborted; + s8 nr_allocations; /* PRP list pool allocations. 0 means small + pool in use */ unsigned int dma_len; /* length of single DMA segment mapping */ + dma_addr_t first_dma; dma_addr_t meta_dma; struct sg_table sgt; }; @@ -430,11 +430,6 @@ static int nvme_pci_init_request(struct blk_mq_tag_set *set, { struct nvme_dev *dev = set->driver_data; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0; - struct nvme_queue *nvmeq = &dev->queues[queue_idx]; - - BUG_ON(!nvmeq); - iod->nvmeq = nvmeq; nvme_req(req)->ctrl = &dev->ctrl; nvme_req(req)->cmd = &iod->cmd; @@ -450,7 +445,7 @@ static int queue_irq_offset(struct nvme_dev *dev) return 0; } -static int nvme_pci_map_queues(struct blk_mq_tag_set *set) +static void nvme_pci_map_queues(struct blk_mq_tag_set *set) { struct nvme_dev *dev = set->driver_data; int i, qoff, offset; @@ -477,8 +472,6 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set) qoff += map->nr_queues; offset += map->nr_queues; } - - return 0; } /* @@ -528,7 +521,7 @@ static void **nvme_pci_iod_list(struct request *req) static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req) { - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; int nseg = blk_rq_nr_phys_segments(req); unsigned int avg_seg_size; @@ -536,7 +529,7 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req) if (!nvme_ctrl_sgl_supported(&dev->ctrl)) return false; - if (!iod->nvmeq->qid) + if (!nvmeq->qid) return false; if (!sgl_threshold || avg_seg_size < sgl_threshold) return false; @@ -550,7 +543,7 @@ static void nvme_free_prps(struct nvme_dev *dev, struct request *req) dma_addr_t dma_addr = iod->first_dma; int i; - for (i = 0; i < iod->npages; i++) { + for (i = 0; i < iod->nr_allocations; i++) { __le64 *prp_list = nvme_pci_iod_list(req)[i]; dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]); @@ -566,7 +559,7 @@ static void nvme_free_sgls(struct nvme_dev *dev, struct request *req) dma_addr_t dma_addr = iod->first_dma; int i; - for (i = 0; i < iod->npages; i++) { + for (i = 0; i < iod->nr_allocations; i++) { struct nvme_sgl_desc *sg_list = nvme_pci_iod_list(req)[i]; dma_addr_t next_dma_addr = le64_to_cpu((sg_list[last_sg]).addr); @@ -589,7 +582,7 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0); - if (iod->npages == 0) + if (iod->nr_allocations == 0) dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0], iod->first_dma); else if (iod->use_sgl) @@ -651,15 +644,15 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE); if (nprps <= (256 / 8)) { pool = dev->prp_small_pool; - iod->npages = 0; + iod->nr_allocations = 0; } else { pool = dev->prp_page_pool; - iod->npages = 1; + iod->nr_allocations = 1; } prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); if (!prp_list) { - iod->npages = -1; + iod->nr_allocations = -1; return BLK_STS_RESOURCE; } list[0] = prp_list; @@ -671,7 +664,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); if (!prp_list) goto free_prps; - list[iod->npages++] = prp_list; + list[iod->nr_allocations++] = prp_list; prp_list[0] = old_prp_list[i - 1]; old_prp_list[i - 1] = cpu_to_le64(prp_dma); i = 1; @@ -746,15 +739,15 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, if (entries <= (256 / sizeof(struct nvme_sgl_desc))) { pool = dev->prp_small_pool; - iod->npages = 0; + iod->nr_allocations = 0; } else { pool = dev->prp_page_pool; - iod->npages = 1; + iod->nr_allocations = 1; } sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma); if (!sg_list) { - iod->npages = -1; + iod->nr_allocations = -1; return BLK_STS_RESOURCE; } @@ -773,7 +766,7 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, goto free_sgls; i = 0; - nvme_pci_iod_list(req)[iod->npages++] = sg_list; + nvme_pci_iod_list(req)[iod->nr_allocations++] = sg_list; sg_list[i++] = *link; nvme_pci_sgl_set_seg(link, sgl_dma, entries); } @@ -833,6 +826,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, int rc; if (blk_rq_nr_phys_segments(req) == 1) { + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; struct bio_vec bv = req_bvec(req); if (!is_pci_p2pdma_page(bv.bv_page)) { @@ -840,7 +834,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, return nvme_setup_prp_simple(dev, req, &cmnd->rw, &bv); - if (iod->nvmeq->qid && sgl_threshold && + if (nvmeq->qid && sgl_threshold && nvme_ctrl_sgl_supported(&dev->ctrl)) return nvme_setup_sgl_simple(dev, req, &cmnd->rw, &bv); @@ -898,8 +892,8 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) struct nvme_iod *iod = blk_mq_rq_to_pdu(req); blk_status_t ret; - iod->aborted = 0; - iod->npages = -1; + iod->aborted = false; + iod->nr_allocations = -1; iod->sgt.nents = 0; ret = nvme_setup_cmd(req->q->queuedata, req); @@ -1019,12 +1013,16 @@ static void nvme_queue_rqs(struct request **rqlist) static __always_inline void nvme_pci_unmap_rq(struct request *req) { - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct nvme_dev *dev = iod->nvmeq->dev; + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + struct nvme_dev *dev = nvmeq->dev; + + if (blk_integrity_rq(req)) { + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - if (blk_integrity_rq(req)) dma_unmap_page(dev->dev, iod->meta_dma, rq_integrity_vec(req)->bv_len, rq_data_dir(req)); + } + if (blk_rq_nr_phys_segments(req)) nvme_unmap_data(dev, req); } @@ -1272,8 +1270,7 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) static void abort_endio(struct request *req, blk_status_t error) { - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct nvme_queue *nvmeq = iod->nvmeq; + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; dev_warn(nvmeq->dev->ctrl.device, "Abort status: 0x%x", nvme_req(req)->status); @@ -1335,7 +1332,7 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 csts) static enum blk_eh_timer_return nvme_timeout(struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct nvme_queue *nvmeq = iod->nvmeq; + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; struct nvme_dev *dev = nvmeq->dev; struct request *abort_req; struct nvme_command cmd = { }; @@ -1416,7 +1413,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req) atomic_inc(&dev->ctrl.abort_limit); return BLK_EH_RESET_TIMER; } - iod->aborted = 1; + iod->aborted = true; cmd.abort.opcode = nvme_admin_abort_cmd; cmd.abort.cid = nvme_cid(req); @@ -2529,9 +2526,11 @@ static void nvme_pci_alloc_tag_set(struct nvme_dev *dev) set->ops = &nvme_mq_ops; set->nr_hw_queues = dev->online_queues - 1; - set->nr_maps = 2; /* default + read */ + set->nr_maps = 1; + if (dev->io_queues[HCTX_TYPE_READ]) + set->nr_maps = 2; if (dev->io_queues[HCTX_TYPE_POLL]) - set->nr_maps++; + set->nr_maps = 3; set->timeout = NVME_IO_TIMEOUT; set->numa_node = dev->ctrl.numa_node; set->queue_depth = min_t(unsigned, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; @@ -2834,6 +2833,8 @@ static void nvme_reset_work(struct work_struct *work) nvme_start_admin_queue(&dev->ctrl); } + dma_set_min_align_mask(dev->dev, NVME_CTRL_PAGE_SIZE - 1); + /* * Limit the max command size to prevent iod->sg allocations going * over a single page. @@ -2846,7 +2847,6 @@ static void nvme_reset_work(struct work_struct *work) * Don't limit the IOMMU merged segment size. */ dma_set_max_seg_size(dev->dev, 0xffffffff); - dma_set_min_align_mask(dev->dev, NVME_CTRL_PAGE_SIZE - 1); mutex_unlock(&dev->shutdown_lock); @@ -3569,6 +3569,8 @@ static int __init nvme_init(void) BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64); BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64); BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2); + BUILD_BUG_ON(DIV_ROUND_UP(nvme_pci_npages_prp(), NVME_CTRL_PAGE_SIZE) > + S8_MAX); return pci_register_driver(&nvme_driver); } diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 3100643be299..5ad0ab2853a4 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -295,7 +295,7 @@ static int nvme_rdma_init_request(struct blk_mq_tag_set *set, struct request *rq, unsigned int hctx_idx, unsigned int numa_node) { - struct nvme_rdma_ctrl *ctrl = set->driver_data; + struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(set->driver_data); struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0; struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx]; @@ -320,7 +320,7 @@ static int nvme_rdma_init_request(struct blk_mq_tag_set *set, static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { - struct nvme_rdma_ctrl *ctrl = data; + struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(data); struct nvme_rdma_queue *queue = &ctrl->queues[hctx_idx + 1]; BUG_ON(hctx_idx >= ctrl->ctrl.queue_count); @@ -332,7 +332,7 @@ static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, static int nvme_rdma_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { - struct nvme_rdma_ctrl *ctrl = data; + struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(data); struct nvme_rdma_queue *queue = &ctrl->queues[0]; BUG_ON(hctx_idx != 0); @@ -696,11 +696,12 @@ static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx) return ret; } -static int nvme_rdma_start_io_queues(struct nvme_rdma_ctrl *ctrl) +static int nvme_rdma_start_io_queues(struct nvme_rdma_ctrl *ctrl, + int first, int last) { int i, ret = 0; - for (i = 1; i < ctrl->ctrl.queue_count; i++) { + for (i = first; i < last; i++) { ret = nvme_rdma_start_queue(ctrl, i); if (ret) goto out_stop_queues; @@ -709,7 +710,7 @@ static int nvme_rdma_start_io_queues(struct nvme_rdma_ctrl *ctrl) return 0; out_stop_queues: - for (i--; i >= 1; i--) + for (i--; i >= first; i--) nvme_rdma_stop_queue(&ctrl->queues[i]); return ret; } @@ -787,64 +788,21 @@ out_free_queues: return ret; } -static int nvme_rdma_alloc_admin_tag_set(struct nvme_ctrl *nctrl) +static int nvme_rdma_alloc_tag_set(struct nvme_ctrl *ctrl) { - struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); - struct blk_mq_tag_set *set = &ctrl->admin_tag_set; - int ret; + unsigned int cmd_size = sizeof(struct nvme_rdma_request) + + NVME_RDMA_DATA_SGL_SIZE; - memset(set, 0, sizeof(*set)); - set->ops = &nvme_rdma_admin_mq_ops; - set->queue_depth = NVME_AQ_MQ_TAG_DEPTH; - set->reserved_tags = NVMF_RESERVED_TAGS; - set->numa_node = nctrl->numa_node; - set->cmd_size = sizeof(struct nvme_rdma_request) + - NVME_RDMA_DATA_SGL_SIZE; - set->driver_data = ctrl; - set->nr_hw_queues = 1; - set->timeout = NVME_ADMIN_TIMEOUT; - set->flags = BLK_MQ_F_NO_SCHED; - ret = blk_mq_alloc_tag_set(set); - if (!ret) - ctrl->ctrl.admin_tagset = set; - return ret; -} - -static int nvme_rdma_alloc_tag_set(struct nvme_ctrl *nctrl) -{ - struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); - struct blk_mq_tag_set *set = &ctrl->tag_set; - int ret; + if (ctrl->max_integrity_segments) + cmd_size += sizeof(struct nvme_rdma_sgl) + + NVME_RDMA_METADATA_SGL_SIZE; - memset(set, 0, sizeof(*set)); - set->ops = &nvme_rdma_mq_ops; - set->queue_depth = nctrl->sqsize + 1; - set->reserved_tags = NVMF_RESERVED_TAGS; - set->numa_node = nctrl->numa_node; - set->flags = BLK_MQ_F_SHOULD_MERGE; - set->cmd_size = sizeof(struct nvme_rdma_request) + - NVME_RDMA_DATA_SGL_SIZE; - if (nctrl->max_integrity_segments) - set->cmd_size += sizeof(struct nvme_rdma_sgl) + - NVME_RDMA_METADATA_SGL_SIZE; - set->driver_data = ctrl; - set->nr_hw_queues = nctrl->queue_count - 1; - set->timeout = NVME_IO_TIMEOUT; - set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2; - ret = blk_mq_alloc_tag_set(set); - if (!ret) - ctrl->ctrl.tagset = set; - return ret; + return nvme_alloc_io_tag_set(ctrl, &to_rdma_ctrl(ctrl)->tag_set, + &nvme_rdma_mq_ops, BLK_MQ_F_SHOULD_MERGE, cmd_size); } -static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl, - bool remove) +static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl) { - if (remove) { - blk_mq_destroy_queue(ctrl->ctrl.admin_q); - blk_mq_destroy_queue(ctrl->ctrl.fabrics_q); - blk_mq_free_tag_set(ctrl->ctrl.admin_tagset); - } if (ctrl->async_event_sqe.data) { cancel_work_sync(&ctrl->ctrl.async_event_work); nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe, @@ -886,26 +844,19 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, goto out_free_queue; if (new) { - error = nvme_rdma_alloc_admin_tag_set(&ctrl->ctrl); + error = nvme_alloc_admin_tag_set(&ctrl->ctrl, + &ctrl->admin_tag_set, &nvme_rdma_admin_mq_ops, + BLK_MQ_F_NO_SCHED, + sizeof(struct nvme_rdma_request) + + NVME_RDMA_DATA_SGL_SIZE); if (error) goto out_free_async_qe; - ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set); - if (IS_ERR(ctrl->ctrl.fabrics_q)) { - error = PTR_ERR(ctrl->ctrl.fabrics_q); - goto out_free_tagset; - } - - ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); - if (IS_ERR(ctrl->ctrl.admin_q)) { - error = PTR_ERR(ctrl->ctrl.admin_q); - goto out_cleanup_fabrics_q; - } } error = nvme_rdma_start_queue(ctrl, 0); if (error) - goto out_cleanup_queue; + goto out_remove_admin_tag_set; error = nvme_enable_ctrl(&ctrl->ctrl); if (error) @@ -932,15 +883,9 @@ out_quiesce_queue: out_stop_queue: nvme_rdma_stop_queue(&ctrl->queues[0]); nvme_cancel_admin_tagset(&ctrl->ctrl); -out_cleanup_queue: - if (new) - blk_mq_destroy_queue(ctrl->ctrl.admin_q); -out_cleanup_fabrics_q: +out_remove_admin_tag_set: if (new) - blk_mq_destroy_queue(ctrl->ctrl.fabrics_q); -out_free_tagset: - if (new) - blk_mq_free_tag_set(ctrl->ctrl.admin_tagset); + nvme_remove_admin_tag_set(&ctrl->ctrl); out_free_async_qe: if (ctrl->async_event_sqe.data) { nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe, @@ -952,19 +897,9 @@ out_free_queue: return error; } -static void nvme_rdma_destroy_io_queues(struct nvme_rdma_ctrl *ctrl, - bool remove) -{ - if (remove) { - blk_mq_destroy_queue(ctrl->ctrl.connect_q); - blk_mq_free_tag_set(ctrl->ctrl.tagset); - } - nvme_rdma_free_io_queues(ctrl); -} - static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) { - int ret; + int ret, nr_queues; ret = nvme_rdma_alloc_io_queues(ctrl); if (ret) @@ -974,15 +909,17 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) ret = nvme_rdma_alloc_tag_set(&ctrl->ctrl); if (ret) goto out_free_io_queues; - - ret = nvme_ctrl_init_connect_q(&(ctrl->ctrl)); - if (ret) - goto out_free_tag_set; } - ret = nvme_rdma_start_io_queues(ctrl); + /* + * Only start IO queues for which we have allocated the tagset + * and limitted it to the available queues. On reconnects, the + * queue number might have changed. + */ + nr_queues = min(ctrl->tag_set.nr_hw_queues + 1, ctrl->ctrl.queue_count); + ret = nvme_rdma_start_io_queues(ctrl, 1, nr_queues); if (ret) - goto out_cleanup_connect_q; + goto out_cleanup_tagset; if (!new) { nvme_start_queues(&ctrl->ctrl); @@ -1000,19 +937,25 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) nvme_unfreeze(&ctrl->ctrl); } + /* + * If the number of queues has increased (reconnect case) + * start all new queues now. + */ + ret = nvme_rdma_start_io_queues(ctrl, nr_queues, + ctrl->tag_set.nr_hw_queues + 1); + if (ret) + goto out_wait_freeze_timed_out; + return 0; out_wait_freeze_timed_out: nvme_stop_queues(&ctrl->ctrl); nvme_sync_io_queues(&ctrl->ctrl); nvme_rdma_stop_io_queues(ctrl); -out_cleanup_connect_q: +out_cleanup_tagset: nvme_cancel_tagset(&ctrl->ctrl); if (new) - blk_mq_destroy_queue(ctrl->ctrl.connect_q); -out_free_tag_set: - if (new) - blk_mq_free_tag_set(ctrl->ctrl.tagset); + nvme_remove_io_tag_set(&ctrl->ctrl); out_free_io_queues: nvme_rdma_free_io_queues(ctrl); return ret; @@ -1025,9 +968,11 @@ static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl, blk_sync_queue(ctrl->ctrl.admin_q); nvme_rdma_stop_queue(&ctrl->queues[0]); nvme_cancel_admin_tagset(&ctrl->ctrl); - if (remove) + if (remove) { nvme_start_admin_queue(&ctrl->ctrl); - nvme_rdma_destroy_admin_queue(ctrl, remove); + nvme_remove_admin_tag_set(&ctrl->ctrl); + } + nvme_rdma_destroy_admin_queue(ctrl); } static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl, @@ -1039,9 +984,11 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl, nvme_sync_io_queues(&ctrl->ctrl); nvme_rdma_stop_io_queues(ctrl); nvme_cancel_tagset(&ctrl->ctrl); - if (remove) + if (remove) { nvme_start_queues(&ctrl->ctrl); - nvme_rdma_destroy_io_queues(ctrl, remove); + nvme_remove_io_tag_set(&ctrl->ctrl); + } + nvme_rdma_free_io_queues(ctrl); } } @@ -1163,14 +1110,18 @@ destroy_io: nvme_sync_io_queues(&ctrl->ctrl); nvme_rdma_stop_io_queues(ctrl); nvme_cancel_tagset(&ctrl->ctrl); - nvme_rdma_destroy_io_queues(ctrl, new); + if (new) + nvme_remove_io_tag_set(&ctrl->ctrl); + nvme_rdma_free_io_queues(ctrl); } destroy_admin: nvme_stop_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); nvme_rdma_stop_queue(&ctrl->queues[0]); nvme_cancel_admin_tagset(&ctrl->ctrl); - nvme_rdma_destroy_admin_queue(ctrl, new); + if (new) + nvme_remove_admin_tag_set(&ctrl->ctrl); + nvme_rdma_destroy_admin_queue(ctrl); return ret; } @@ -2188,9 +2139,9 @@ static void nvme_rdma_complete_rq(struct request *rq) nvme_complete_rq(rq); } -static int nvme_rdma_map_queues(struct blk_mq_tag_set *set) +static void nvme_rdma_map_queues(struct blk_mq_tag_set *set) { - struct nvme_rdma_ctrl *ctrl = set->driver_data; + struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(set->driver_data); struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) { @@ -2231,8 +2182,6 @@ static int nvme_rdma_map_queues(struct blk_mq_tag_set *set) ctrl->io_queues[HCTX_TYPE_DEFAULT], ctrl->io_queues[HCTX_TYPE_READ], ctrl->io_queues[HCTX_TYPE_POLL]); - - return 0; } static const struct blk_mq_ops nvme_rdma_mq_ops = { diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index d5871fd6f769..93e2e313fa70 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -133,7 +133,6 @@ struct nvme_tcp_queue { /* send state */ struct nvme_tcp_request *request; - int queue_size; u32 maxh2cdata; size_t cmnd_capsule_len; struct nvme_tcp_ctrl *ctrl; @@ -463,7 +462,7 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set, struct request *rq, unsigned int hctx_idx, unsigned int numa_node) { - struct nvme_tcp_ctrl *ctrl = set->driver_data; + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(set->driver_data); struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); struct nvme_tcp_cmd_pdu *pdu; int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0; @@ -487,7 +486,7 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set, static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { - struct nvme_tcp_ctrl *ctrl = data; + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(data); struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1]; hctx->driver_data = queue; @@ -497,7 +496,7 @@ static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { - struct nvme_tcp_ctrl *ctrl = data; + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(data); struct nvme_tcp_queue *queue = &ctrl->queues[0]; hctx->driver_data = queue; @@ -1476,8 +1475,7 @@ static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue) queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false); } -static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, - int qid, size_t queue_size) +static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid) { struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); struct nvme_tcp_queue *queue = &ctrl->queues[qid]; @@ -1489,7 +1487,6 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, INIT_LIST_HEAD(&queue->send_list); mutex_init(&queue->send_mutex); INIT_WORK(&queue->io_work, nvme_tcp_io_work); - queue->queue_size = queue_size; if (qid > 0) queue->cmnd_capsule_len = nctrl->ioccsz * 16; @@ -1687,51 +1684,6 @@ static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx) return ret; } -static int nvme_tcp_alloc_admin_tag_set(struct nvme_ctrl *nctrl) -{ - struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); - struct blk_mq_tag_set *set = &ctrl->admin_tag_set; - int ret; - - memset(set, 0, sizeof(*set)); - set->ops = &nvme_tcp_admin_mq_ops; - set->queue_depth = NVME_AQ_MQ_TAG_DEPTH; - set->reserved_tags = NVMF_RESERVED_TAGS; - set->numa_node = nctrl->numa_node; - set->flags = BLK_MQ_F_BLOCKING; - set->cmd_size = sizeof(struct nvme_tcp_request); - set->driver_data = ctrl; - set->nr_hw_queues = 1; - set->timeout = NVME_ADMIN_TIMEOUT; - ret = blk_mq_alloc_tag_set(set); - if (!ret) - nctrl->admin_tagset = set; - return ret; -} - -static int nvme_tcp_alloc_tag_set(struct nvme_ctrl *nctrl) -{ - struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); - struct blk_mq_tag_set *set = &ctrl->tag_set; - int ret; - - memset(set, 0, sizeof(*set)); - set->ops = &nvme_tcp_mq_ops; - set->queue_depth = nctrl->sqsize + 1; - set->reserved_tags = NVMF_RESERVED_TAGS; - set->numa_node = nctrl->numa_node; - set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING; - set->cmd_size = sizeof(struct nvme_tcp_request); - set->driver_data = ctrl; - set->nr_hw_queues = nctrl->queue_count - 1; - set->timeout = NVME_IO_TIMEOUT; - set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2; - ret = blk_mq_alloc_tag_set(set); - if (!ret) - nctrl->tagset = set; - return ret; -} - static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl) { if (to_tcp_ctrl(ctrl)->async_req.pdu) { @@ -1759,11 +1711,12 @@ static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl) nvme_tcp_stop_queue(ctrl, i); } -static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl) +static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl, + int first, int last) { int i, ret; - for (i = 1; i < ctrl->queue_count; i++) { + for (i = first; i < last; i++) { ret = nvme_tcp_start_queue(ctrl, i); if (ret) goto out_stop_queues; @@ -1772,7 +1725,7 @@ static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl) return 0; out_stop_queues: - for (i--; i >= 1; i--) + for (i--; i >= first; i--) nvme_tcp_stop_queue(ctrl, i); return ret; } @@ -1781,7 +1734,7 @@ static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl) { int ret; - ret = nvme_tcp_alloc_queue(ctrl, 0, NVME_AQ_DEPTH); + ret = nvme_tcp_alloc_queue(ctrl, 0); if (ret) return ret; @@ -1801,7 +1754,7 @@ static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) int i, ret; for (i = 1; i < ctrl->queue_count; i++) { - ret = nvme_tcp_alloc_queue(ctrl, i, ctrl->sqsize + 1); + ret = nvme_tcp_alloc_queue(ctrl, i); if (ret) goto out_free_queues; } @@ -1889,32 +1842,35 @@ static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove) { nvme_tcp_stop_io_queues(ctrl); - if (remove) { - blk_mq_destroy_queue(ctrl->connect_q); - blk_mq_free_tag_set(ctrl->tagset); - } + if (remove) + nvme_remove_io_tag_set(ctrl); nvme_tcp_free_io_queues(ctrl); } static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) { - int ret; + int ret, nr_queues; ret = nvme_tcp_alloc_io_queues(ctrl); if (ret) return ret; if (new) { - ret = nvme_tcp_alloc_tag_set(ctrl); + ret = nvme_alloc_io_tag_set(ctrl, &to_tcp_ctrl(ctrl)->tag_set, + &nvme_tcp_mq_ops, + BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING, + sizeof(struct nvme_tcp_request)); if (ret) goto out_free_io_queues; - - ret = nvme_ctrl_init_connect_q(ctrl); - if (ret) - goto out_free_tag_set; } - ret = nvme_tcp_start_io_queues(ctrl); + /* + * Only start IO queues for which we have allocated the tagset + * and limitted it to the available queues. On reconnects, the + * queue number might have changed. + */ + nr_queues = min(ctrl->tagset->nr_hw_queues + 1, ctrl->queue_count); + ret = nvme_tcp_start_io_queues(ctrl, 1, nr_queues); if (ret) goto out_cleanup_connect_q; @@ -1934,6 +1890,15 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) nvme_unfreeze(ctrl); } + /* + * If the number of queues has increased (reconnect case) + * start all new queues now. + */ + ret = nvme_tcp_start_io_queues(ctrl, nr_queues, + ctrl->tagset->nr_hw_queues + 1); + if (ret) + goto out_wait_freeze_timed_out; + return 0; out_wait_freeze_timed_out: @@ -1943,10 +1908,7 @@ out_wait_freeze_timed_out: out_cleanup_connect_q: nvme_cancel_tagset(ctrl); if (new) - blk_mq_destroy_queue(ctrl->connect_q); -out_free_tag_set: - if (new) - blk_mq_free_tag_set(ctrl->tagset); + nvme_remove_io_tag_set(ctrl); out_free_io_queues: nvme_tcp_free_io_queues(ctrl); return ret; @@ -1955,11 +1917,8 @@ out_free_io_queues: static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove) { nvme_tcp_stop_queue(ctrl, 0); - if (remove) { - blk_mq_destroy_queue(ctrl->admin_q); - blk_mq_destroy_queue(ctrl->fabrics_q); - blk_mq_free_tag_set(ctrl->admin_tagset); - } + if (remove) + nvme_remove_admin_tag_set(ctrl); nvme_tcp_free_admin_queue(ctrl); } @@ -1972,26 +1931,17 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) return error; if (new) { - error = nvme_tcp_alloc_admin_tag_set(ctrl); + error = nvme_alloc_admin_tag_set(ctrl, + &to_tcp_ctrl(ctrl)->admin_tag_set, + &nvme_tcp_admin_mq_ops, BLK_MQ_F_BLOCKING, + sizeof(struct nvme_tcp_request)); if (error) goto out_free_queue; - - ctrl->fabrics_q = blk_mq_init_queue(ctrl->admin_tagset); - if (IS_ERR(ctrl->fabrics_q)) { - error = PTR_ERR(ctrl->fabrics_q); - goto out_free_tagset; - } - - ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset); - if (IS_ERR(ctrl->admin_q)) { - error = PTR_ERR(ctrl->admin_q); - goto out_cleanup_fabrics_q; - } } error = nvme_tcp_start_queue(ctrl, 0); if (error) - goto out_cleanup_queue; + goto out_cleanup_tagset; error = nvme_enable_ctrl(ctrl); if (error) @@ -2011,15 +1961,9 @@ out_quiesce_queue: out_stop_queue: nvme_tcp_stop_queue(ctrl, 0); nvme_cancel_admin_tagset(ctrl); -out_cleanup_queue: - if (new) - blk_mq_destroy_queue(ctrl->admin_q); -out_cleanup_fabrics_q: +out_cleanup_tagset: if (new) - blk_mq_destroy_queue(ctrl->fabrics_q); -out_free_tagset: - if (new) - blk_mq_free_tag_set(ctrl->admin_tagset); + nvme_remove_admin_tag_set(ctrl); out_free_queue: nvme_tcp_free_admin_queue(ctrl); return error; @@ -2468,9 +2412,9 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; } -static int nvme_tcp_map_queues(struct blk_mq_tag_set *set) +static void nvme_tcp_map_queues(struct blk_mq_tag_set *set) { - struct nvme_tcp_ctrl *ctrl = set->driver_data; + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(set->driver_data); struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) { @@ -2509,8 +2453,6 @@ static int nvme_tcp_map_queues(struct blk_mq_tag_set *set) ctrl->io_queues[HCTX_TYPE_DEFAULT], ctrl->io_queues[HCTX_TYPE_READ], ctrl->io_queues[HCTX_TYPE_POLL]); - - return 0; } static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) @@ -2529,6 +2471,25 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) return queue->nr_cqe; } +static int nvme_tcp_get_address(struct nvme_ctrl *ctrl, char *buf, int size) +{ + struct nvme_tcp_queue *queue = &to_tcp_ctrl(ctrl)->queues[0]; + struct sockaddr_storage src_addr; + int ret, len; + + len = nvmf_get_address(ctrl, buf, size); + + ret = kernel_getsockname(queue->sock, (struct sockaddr *)&src_addr); + if (ret > 0) { + if (len > 0) + len--; /* strip trailing newline */ + len += scnprintf(buf + len, size - len, "%ssrc_addr=%pISc\n", + (len) ? "," : "", &src_addr); + } + + return len; +} + static const struct blk_mq_ops nvme_tcp_mq_ops = { .queue_rq = nvme_tcp_queue_rq, .commit_rqs = nvme_tcp_commit_rqs, @@ -2560,7 +2521,7 @@ static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = { .free_ctrl = nvme_tcp_free_ctrl, .submit_async_event = nvme_tcp_submit_async_event, .delete_ctrl = nvme_tcp_delete_ctrl, - .get_address = nvmf_get_address, + .get_address = nvme_tcp_get_address, .stop_ctrl = nvme_tcp_stop_ctrl, }; diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index fc8a957fad0a..c8a061ce3ee5 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -449,7 +449,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) if (req->port->inline_data_size) id->sgls |= cpu_to_le32(1 << 20); - strlcpy(id->subnqn, ctrl->subsys->subsysnqn, sizeof(id->subnqn)); + strscpy(id->subnqn, ctrl->subsys->subsysnqn, sizeof(id->subnqn)); /* * Max command capsule size is sqe + in-capsule data size. diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 2bcd60758919..e34a2896fedb 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -1281,6 +1281,34 @@ static ssize_t nvmet_subsys_attr_pi_enable_store(struct config_item *item, CONFIGFS_ATTR(nvmet_subsys_, attr_pi_enable); #endif +static ssize_t nvmet_subsys_attr_qid_max_show(struct config_item *item, + char *page) +{ + return snprintf(page, PAGE_SIZE, "%u\n", to_subsys(item)->max_qid); +} + +static ssize_t nvmet_subsys_attr_qid_max_store(struct config_item *item, + const char *page, size_t cnt) +{ + struct nvmet_port *port = to_nvmet_port(item); + u16 qid_max; + + if (nvmet_is_port_enabled(port, __func__)) + return -EACCES; + + if (sscanf(page, "%hu\n", &qid_max) != 1) + return -EINVAL; + + if (qid_max < 1 || qid_max > NVMET_NR_QUEUES) + return -EINVAL; + + down_write(&nvmet_config_sem); + to_subsys(item)->max_qid = qid_max; + up_write(&nvmet_config_sem); + return cnt; +} +CONFIGFS_ATTR(nvmet_subsys_, attr_qid_max); + static struct configfs_attribute *nvmet_subsys_attrs[] = { &nvmet_subsys_attr_attr_allow_any_host, &nvmet_subsys_attr_attr_version, @@ -1288,6 +1316,7 @@ static struct configfs_attribute *nvmet_subsys_attrs[] = { &nvmet_subsys_attr_attr_cntlid_min, &nvmet_subsys_attr_attr_cntlid_max, &nvmet_subsys_attr_attr_model, + &nvmet_subsys_attr_attr_qid_max, #ifdef CONFIG_BLK_DEV_INTEGRITY &nvmet_subsys_attr_attr_pi_enable, #endif diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 7f4083cf953a..14677145bbba 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -832,6 +832,7 @@ int nvmet_sq_init(struct nvmet_sq *sq) } init_completion(&sq->free_done); init_completion(&sq->confirm_done); + nvmet_auth_sq_init(sq); return 0; } diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index c2162eef8ce1..668d257fa986 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -292,7 +292,7 @@ static void nvmet_execute_disc_identify(struct nvmet_req *req) id->oaes = cpu_to_le32(NVMET_DISC_AEN_CFG_OPTIONAL); - strlcpy(id->subnqn, ctrl->subsys->subsysnqn, sizeof(id->subnqn)); + strscpy(id->subnqn, ctrl->subsys->subsysnqn, sizeof(id->subnqn)); status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); diff --git a/drivers/nvme/target/fabrics-cmd-auth.c b/drivers/nvme/target/fabrics-cmd-auth.c index ebdf9aa81041..7970a7640e58 100644 --- a/drivers/nvme/target/fabrics-cmd-auth.c +++ b/drivers/nvme/target/fabrics-cmd-auth.c @@ -23,17 +23,12 @@ static void nvmet_auth_expired_work(struct work_struct *work) sq->dhchap_tid = -1; } -void nvmet_init_auth(struct nvmet_ctrl *ctrl, struct nvmet_req *req) +void nvmet_auth_sq_init(struct nvmet_sq *sq) { - u32 result = le32_to_cpu(req->cqe->result.u32); - /* Initialize in-band authentication */ - INIT_DELAYED_WORK(&req->sq->auth_expired_work, - nvmet_auth_expired_work); - req->sq->authenticated = false; - req->sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE; - result |= (u32)NVME_CONNECT_AUTHREQ_ATR << 16; - req->cqe->result.u32 = cpu_to_le32(result); + INIT_DELAYED_WORK(&sq->auth_expired_work, nvmet_auth_expired_work); + sq->authenticated = false; + sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE; } static u16 nvmet_auth_negotiate(struct nvmet_req *req, void *d) @@ -177,7 +172,7 @@ static u16 nvmet_auth_reply(struct nvmet_req *req, void *d) return 0; } -static u16 nvmet_auth_failure2(struct nvmet_req *req, void *d) +static u16 nvmet_auth_failure2(void *d) { struct nvmf_auth_dhchap_failure_data *data = d; @@ -229,10 +224,8 @@ void nvmet_execute_auth_send(struct nvmet_req *req) } status = nvmet_copy_from_sgl(req, 0, d, tl); - if (status) { - kfree(d); - goto done; - } + if (status) + goto done_kfree; data = d; pr_debug("%s: ctrl %d qid %d type %d id %d step %x\n", __func__, @@ -310,7 +303,7 @@ void nvmet_execute_auth_send(struct nvmet_req *req) goto done_kfree; break; case NVME_AUTH_DHCHAP_MESSAGE_FAILURE2: - status = nvmet_auth_failure2(req, d); + status = nvmet_auth_failure2(d); if (status) { pr_warn("ctrl %d qid %d: authentication failed (%d)\n", ctrl->cntlid, req->sq->qid, status); diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index f91a56180d3d..43b5bd8bb6a5 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -198,6 +198,12 @@ err: return ret; } +static u32 nvmet_connect_result(struct nvmet_ctrl *ctrl) +{ + return (u32)ctrl->cntlid | + (nvmet_has_auth(ctrl) ? NVME_CONNECT_AUTHREQ_ATR : 0); +} + static void nvmet_execute_admin_connect(struct nvmet_req *req) { struct nvmf_connect_command *c = &req->cmd->connect; @@ -269,10 +275,7 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req) ctrl->cntlid, ctrl->subsys->subsysnqn, ctrl->hostnqn, ctrl->pi_support ? " T10-PI is enabled" : "", nvmet_has_auth(ctrl) ? " with DH-HMAC-CHAP" : ""); - req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid); - - if (nvmet_has_auth(ctrl)) - nvmet_init_auth(ctrl, req); + req->cqe->result.u32 = cpu_to_le32(nvmet_connect_result(ctrl)); out: kfree(d); complete: @@ -328,14 +331,8 @@ static void nvmet_execute_io_connect(struct nvmet_req *req) if (status) goto out_ctrl_put; - /* pass back cntlid for successful completion */ - req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid); - pr_debug("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid); - req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid); - if (nvmet_has_auth(ctrl)) - nvmet_init_auth(ctrl, req); - + req->cqe->result.u32 = cpu_to_le32(nvmet_connect_result(ctrl)); out: kfree(d); complete: diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 2dc1c1035626..c2d6cea0236b 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -12,11 +12,9 @@ void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id) { - const struct queue_limits *ql = &bdev_get_queue(bdev)->limits; - /* Number of logical blocks per physical block. */ - const u32 lpp = ql->physical_block_size / ql->logical_block_size; /* Logical blocks per physical block, 0's based. */ - const __le16 lpp0b = to0based(lpp); + const __le16 lpp0b = to0based(bdev_physical_block_size(bdev) / + bdev_logical_block_size(bdev)); /* * For NVMe 1.2 and later, bit 1 indicates that the fields NAWUN, @@ -42,11 +40,12 @@ void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id) /* NPWA = Namespace Preferred Write Alignment. 0's based */ id->npwa = id->npwg; /* NPDG = Namespace Preferred Deallocate Granularity. 0's based */ - id->npdg = to0based(ql->discard_granularity / ql->logical_block_size); + id->npdg = to0based(bdev_discard_granularity(bdev) / + bdev_logical_block_size(bdev)); /* NPDG = Namespace Preferred Deallocate Alignment */ id->npda = id->npdg; /* NOWS = Namespace Optimal Write Size */ - id->nows = to0based(ql->io_opt / ql->logical_block_size); + id->nows = to0based(bdev_io_opt(bdev) / bdev_logical_block_size(bdev)); } void nvmet_bdev_ns_disable(struct nvmet_ns *ns) @@ -334,6 +333,11 @@ static void nvmet_bdev_execute_flush(struct nvmet_req *req) { struct bio *bio = &req->b.inline_bio; + if (!bdev_write_cache(req->ns->bdev)) { + nvmet_req_complete(req, NVME_SC_SUCCESS); + return; + } + if (!nvmet_check_transfer_len(req, 0)) return; @@ -347,6 +351,9 @@ static void nvmet_bdev_execute_flush(struct nvmet_req *req) u16 nvmet_bdev_flush(struct nvmet_req *req) { + if (!bdev_write_cache(req->ns->bdev)) + return 0; + if (blkdev_issue_flush(req->ns->bdev)) return NVME_SC_INTERNAL | NVME_SC_DNR; return 0; diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 9750a7fca268..b45fe3adf015 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -204,7 +204,7 @@ static int nvme_loop_init_request(struct blk_mq_tag_set *set, struct request *req, unsigned int hctx_idx, unsigned int numa_node) { - struct nvme_loop_ctrl *ctrl = set->driver_data; + struct nvme_loop_ctrl *ctrl = to_loop_ctrl(set->driver_data); struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req); nvme_req(req)->ctrl = &ctrl->ctrl; @@ -218,7 +218,7 @@ static struct lock_class_key loop_hctx_fq_lock_key; static int nvme_loop_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { - struct nvme_loop_ctrl *ctrl = data; + struct nvme_loop_ctrl *ctrl = to_loop_ctrl(data); struct nvme_loop_queue *queue = &ctrl->queues[hctx_idx + 1]; BUG_ON(hctx_idx >= ctrl->ctrl.queue_count); @@ -238,7 +238,7 @@ static int nvme_loop_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, static int nvme_loop_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { - struct nvme_loop_ctrl *ctrl = data; + struct nvme_loop_ctrl *ctrl = to_loop_ctrl(data); struct nvme_loop_queue *queue = &ctrl->queues[0]; BUG_ON(hctx_idx != 0); @@ -266,9 +266,7 @@ static void nvme_loop_destroy_admin_queue(struct nvme_loop_ctrl *ctrl) if (!test_and_clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags)) return; nvmet_sq_destroy(&ctrl->queues[0].nvme_sq); - blk_mq_destroy_queue(ctrl->ctrl.admin_q); - blk_mq_destroy_queue(ctrl->ctrl.fabrics_q); - blk_mq_free_tag_set(&ctrl->admin_tag_set); + nvme_remove_admin_tag_set(&ctrl->ctrl); } static void nvme_loop_free_ctrl(struct nvme_ctrl *nctrl) @@ -282,10 +280,8 @@ static void nvme_loop_free_ctrl(struct nvme_ctrl *nctrl) list_del(&ctrl->list); mutex_unlock(&nvme_loop_ctrl_mutex); - if (nctrl->tagset) { - blk_mq_destroy_queue(ctrl->ctrl.connect_q); - blk_mq_free_tag_set(&ctrl->tag_set); - } + if (nctrl->tagset) + nvme_remove_io_tag_set(nctrl); kfree(ctrl->queues); nvmf_free_options(nctrl->opts); free_ctrl: @@ -350,52 +346,31 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) { int error; - memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); - ctrl->admin_tag_set.ops = &nvme_loop_admin_mq_ops; - ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH; - ctrl->admin_tag_set.reserved_tags = NVMF_RESERVED_TAGS; - ctrl->admin_tag_set.numa_node = ctrl->ctrl.numa_node; - ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_loop_iod) + - NVME_INLINE_SG_CNT * sizeof(struct scatterlist); - ctrl->admin_tag_set.driver_data = ctrl; - ctrl->admin_tag_set.nr_hw_queues = 1; - ctrl->admin_tag_set.timeout = NVME_ADMIN_TIMEOUT; - ctrl->admin_tag_set.flags = BLK_MQ_F_NO_SCHED; - ctrl->queues[0].ctrl = ctrl; error = nvmet_sq_init(&ctrl->queues[0].nvme_sq); if (error) return error; ctrl->ctrl.queue_count = 1; - error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); + error = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set, + &nvme_loop_admin_mq_ops, BLK_MQ_F_NO_SCHED, + sizeof(struct nvme_loop_iod) + + NVME_INLINE_SG_CNT * sizeof(struct scatterlist)); if (error) goto out_free_sq; - ctrl->ctrl.admin_tagset = &ctrl->admin_tag_set; - ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set); - if (IS_ERR(ctrl->ctrl.fabrics_q)) { - error = PTR_ERR(ctrl->ctrl.fabrics_q); - goto out_free_tagset; - } - - ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); - if (IS_ERR(ctrl->ctrl.admin_q)) { - error = PTR_ERR(ctrl->ctrl.admin_q); - goto out_cleanup_fabrics_q; - } /* reset stopped state for the fresh admin queue */ clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->ctrl.flags); error = nvmf_connect_admin_queue(&ctrl->ctrl); if (error) - goto out_cleanup_queue; + goto out_cleanup_tagset; set_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags); error = nvme_enable_ctrl(&ctrl->ctrl); if (error) - goto out_cleanup_queue; + goto out_cleanup_tagset; ctrl->ctrl.max_hw_sectors = (NVME_LOOP_MAX_SEGMENTS - 1) << (PAGE_SHIFT - 9); @@ -404,17 +379,13 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) error = nvme_init_ctrl_finish(&ctrl->ctrl); if (error) - goto out_cleanup_queue; + goto out_cleanup_tagset; return 0; -out_cleanup_queue: +out_cleanup_tagset: clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags); - blk_mq_destroy_queue(ctrl->ctrl.admin_q); -out_cleanup_fabrics_q: - blk_mq_destroy_queue(ctrl->ctrl.fabrics_q); -out_free_tagset: - blk_mq_free_tag_set(&ctrl->admin_tag_set); + nvme_remove_admin_tag_set(&ctrl->ctrl); out_free_sq: nvmet_sq_destroy(&ctrl->queues[0].nvme_sq); return error; @@ -522,37 +493,21 @@ static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl) if (ret) return ret; - memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set)); - ctrl->tag_set.ops = &nvme_loop_mq_ops; - ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size; - ctrl->tag_set.reserved_tags = NVMF_RESERVED_TAGS; - ctrl->tag_set.numa_node = ctrl->ctrl.numa_node; - ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; - ctrl->tag_set.cmd_size = sizeof(struct nvme_loop_iod) + - NVME_INLINE_SG_CNT * sizeof(struct scatterlist); - ctrl->tag_set.driver_data = ctrl; - ctrl->tag_set.nr_hw_queues = ctrl->ctrl.queue_count - 1; - ctrl->tag_set.timeout = NVME_IO_TIMEOUT; - ctrl->ctrl.tagset = &ctrl->tag_set; - - ret = blk_mq_alloc_tag_set(&ctrl->tag_set); + ret = nvme_alloc_io_tag_set(&ctrl->ctrl, &ctrl->tag_set, + &nvme_loop_mq_ops, BLK_MQ_F_SHOULD_MERGE, + sizeof(struct nvme_loop_iod) + + NVME_INLINE_SG_CNT * sizeof(struct scatterlist)); if (ret) goto out_destroy_queues; - ret = nvme_ctrl_init_connect_q(&(ctrl->ctrl)); - if (ret) - goto out_free_tagset; - ret = nvme_loop_connect_io_queues(ctrl); if (ret) - goto out_cleanup_connect_q; + goto out_cleanup_tagset; return 0; -out_cleanup_connect_q: - blk_mq_destroy_queue(ctrl->ctrl.connect_q); -out_free_tagset: - blk_mq_free_tag_set(&ctrl->tag_set); +out_cleanup_tagset: + nvme_remove_io_tag_set(&ctrl->ctrl); out_destroy_queues: nvme_loop_destroy_io_queues(ctrl); return ret; @@ -601,7 +556,6 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, ret = -ENOMEM; - ctrl->ctrl.sqsize = opts->queue_size - 1; ctrl->ctrl.kato = opts->kato; ctrl->port = nvme_loop_find_port(&ctrl->ctrl); @@ -621,6 +575,7 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, opts->queue_size, ctrl->ctrl.maxcmd); opts->queue_size = ctrl->ctrl.maxcmd; } + ctrl->ctrl.sqsize = opts->queue_size - 1; if (opts->nr_io_queues) { ret = nvme_loop_create_io_queues(ctrl); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 6ffeeb0a1c49..dfe3894205aa 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -704,7 +704,7 @@ int nvmet_auth_set_key(struct nvmet_host *host, const char *secret, bool set_ctrl); int nvmet_auth_set_host_hash(struct nvmet_host *host, const char *hash); int nvmet_setup_auth(struct nvmet_ctrl *ctrl); -void nvmet_init_auth(struct nvmet_ctrl *ctrl, struct nvmet_req *req); +void nvmet_auth_sq_init(struct nvmet_sq *sq); void nvmet_destroy_auth(struct nvmet_ctrl *ctrl); void nvmet_auth_sq_free(struct nvmet_sq *sq); int nvmet_setup_dhgroup(struct nvmet_ctrl *ctrl, u8 dhgroup_id); @@ -726,8 +726,9 @@ static inline int nvmet_setup_auth(struct nvmet_ctrl *ctrl) { return 0; } -static inline void nvmet_init_auth(struct nvmet_ctrl *ctrl, - struct nvmet_req *req) {}; +static inline void nvmet_auth_sq_init(struct nvmet_sq *sq) +{ +} static inline void nvmet_destroy_auth(struct nvmet_ctrl *ctrl) {}; static inline void nvmet_auth_sq_free(struct nvmet_sq *sq) {}; static inline bool nvmet_check_auth_status(struct nvmet_req *req) diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 6f39a29828b1..94d3153bae54 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -215,9 +215,11 @@ static void nvmet_passthru_execute_cmd_work(struct work_struct *w) { struct nvmet_req *req = container_of(w, struct nvmet_req, p.work); struct request *rq = req->p.rq; + struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl; + u32 effects; int status; - status = nvme_execute_passthru_rq(rq); + status = nvme_execute_passthru_rq(rq, &effects); if (status == NVME_SC_SUCCESS && req->cmd->common.opcode == nvme_admin_identify) { @@ -238,6 +240,9 @@ static void nvmet_passthru_execute_cmd_work(struct work_struct *w) req->cqe->result = nvme_req(rq)->result; nvmet_req_complete(req, status); blk_mq_free_request(rq); + + if (effects) + nvme_passthru_end(ctrl, effects, req->cmd, status); } static void nvmet_passthru_req_done(struct request *rq, diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index a3694a32f6d5..6c1476e086ef 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -77,9 +77,8 @@ struct nvmet_tcp_cmd { u32 pdu_len; u32 pdu_recv; int sg_idx; - int nr_mapped; struct msghdr recv_msg; - struct kvec *iov; + struct bio_vec *iov; u32 flags; struct list_head entry; @@ -165,9 +164,7 @@ static DEFINE_MUTEX(nvmet_tcp_queue_mutex); static struct workqueue_struct *nvmet_tcp_wq; static const struct nvmet_fabrics_ops nvmet_tcp_ops; static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c); -static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd); static void nvmet_tcp_free_cmd_buffers(struct nvmet_tcp_cmd *cmd); -static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd); static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue, struct nvmet_tcp_cmd *cmd) @@ -301,35 +298,21 @@ static int nvmet_tcp_check_ddgst(struct nvmet_tcp_queue *queue, void *pdu) static void nvmet_tcp_free_cmd_buffers(struct nvmet_tcp_cmd *cmd) { - WARN_ON(unlikely(cmd->nr_mapped > 0)); - kfree(cmd->iov); sgl_free(cmd->req.sg); cmd->iov = NULL; cmd->req.sg = NULL; } -static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd) -{ - struct scatterlist *sg; - int i; - - sg = &cmd->req.sg[cmd->sg_idx]; - - for (i = 0; i < cmd->nr_mapped; i++) - kunmap(sg_page(&sg[i])); - - cmd->nr_mapped = 0; -} - -static void nvmet_tcp_map_pdu_iovec(struct nvmet_tcp_cmd *cmd) +static void nvmet_tcp_build_pdu_iovec(struct nvmet_tcp_cmd *cmd) { - struct kvec *iov = cmd->iov; + struct bio_vec *iov = cmd->iov; struct scatterlist *sg; u32 length, offset, sg_offset; + int nr_pages; length = cmd->pdu_len; - cmd->nr_mapped = DIV_ROUND_UP(length, PAGE_SIZE); + nr_pages = DIV_ROUND_UP(length, PAGE_SIZE); offset = cmd->rbytes_done; cmd->sg_idx = offset / PAGE_SIZE; sg_offset = offset % PAGE_SIZE; @@ -338,8 +321,9 @@ static void nvmet_tcp_map_pdu_iovec(struct nvmet_tcp_cmd *cmd) while (length) { u32 iov_len = min_t(u32, length, sg->length - sg_offset); - iov->iov_base = kmap(sg_page(sg)) + sg->offset + sg_offset; - iov->iov_len = iov_len; + iov->bv_page = sg_page(sg); + iov->bv_len = sg->length; + iov->bv_offset = sg->offset + sg_offset; length -= iov_len; sg = sg_next(sg); @@ -347,8 +331,8 @@ static void nvmet_tcp_map_pdu_iovec(struct nvmet_tcp_cmd *cmd) sg_offset = 0; } - iov_iter_kvec(&cmd->recv_msg.msg_iter, READ, cmd->iov, - cmd->nr_mapped, cmd->pdu_len); + iov_iter_bvec(&cmd->recv_msg.msg_iter, READ, cmd->iov, + nr_pages, cmd->pdu_len); } static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue) @@ -926,7 +910,7 @@ static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue, } queue->rcv_state = NVMET_TCP_RECV_DATA; - nvmet_tcp_map_pdu_iovec(cmd); + nvmet_tcp_build_pdu_iovec(cmd); cmd->flags |= NVMET_TCP_F_INIT_FAILED; } @@ -935,10 +919,17 @@ static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue) struct nvme_tcp_data_pdu *data = &queue->pdu.data; struct nvmet_tcp_cmd *cmd; - if (likely(queue->nr_cmds)) + if (likely(queue->nr_cmds)) { + if (unlikely(data->ttag >= queue->nr_cmds)) { + pr_err("queue %d: received out of bound ttag %u, nr_cmds %u\n", + queue->idx, data->ttag, queue->nr_cmds); + nvmet_tcp_fatal_error(queue); + return -EPROTO; + } cmd = &queue->cmds[data->ttag]; - else + } else { cmd = &queue->connect; + } if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) { pr_err("ttag %u unexpected data offset %u (expected %u)\n", @@ -952,7 +943,7 @@ static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue) cmd->pdu_len = le32_to_cpu(data->data_length); cmd->pdu_recv = 0; - nvmet_tcp_map_pdu_iovec(cmd); + nvmet_tcp_build_pdu_iovec(cmd); queue->cmd = cmd; queue->rcv_state = NVMET_TCP_RECV_DATA; @@ -976,6 +967,13 @@ static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue) return nvmet_tcp_handle_icreq(queue); } + if (unlikely(hdr->type == nvme_tcp_icreq)) { + pr_err("queue %d: received icreq pdu in state %d\n", + queue->idx, queue->state); + nvmet_tcp_fatal_error(queue); + return -EPROTO; + } + if (hdr->type == nvme_tcp_h2c_data) { ret = nvmet_tcp_handle_h2c_data_pdu(queue); if (unlikely(ret)) @@ -1021,7 +1019,7 @@ static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue) if (nvmet_tcp_need_data_in(queue->cmd)) { if (nvmet_tcp_has_inline_data(queue->cmd)) { queue->rcv_state = NVMET_TCP_RECV_DATA; - nvmet_tcp_map_pdu_iovec(queue->cmd); + nvmet_tcp_build_pdu_iovec(queue->cmd); return 0; } /* send back R2T */ @@ -1141,7 +1139,6 @@ static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue) cmd->rbytes_done += ret; } - nvmet_tcp_unmap_pdu_iovec(cmd); if (queue->data_digest) { nvmet_tcp_prep_recv_ddgst(cmd); return 0; @@ -1179,7 +1176,8 @@ static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue) queue->idx, cmd->req.cmd->common.command_id, queue->pdu.cmd.hdr.type, le32_to_cpu(cmd->recv_ddgst), le32_to_cpu(cmd->exp_ddgst)); - nvmet_tcp_finish_cmd(cmd); + nvmet_req_uninit(&cmd->req); + nvmet_tcp_free_cmd_buffers(cmd); nvmet_tcp_fatal_error(queue); ret = -EPROTO; goto out; @@ -1408,13 +1406,6 @@ static void nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue) write_unlock_bh(&sock->sk->sk_callback_lock); } -static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd) -{ - nvmet_req_uninit(&cmd->req); - nvmet_tcp_unmap_pdu_iovec(cmd); - nvmet_tcp_free_cmd_buffers(cmd); -} - static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue) { struct nvmet_tcp_cmd *cmd = queue->cmds; @@ -1423,15 +1414,26 @@ static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue) for (i = 0; i < queue->nr_cmds; i++, cmd++) { if (nvmet_tcp_need_data_in(cmd)) nvmet_req_uninit(&cmd->req); - - nvmet_tcp_unmap_pdu_iovec(cmd); - nvmet_tcp_free_cmd_buffers(cmd); } if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect)) { /* failed in connect */ - nvmet_tcp_finish_cmd(&queue->connect); + nvmet_req_uninit(&queue->connect.req); + } +} + +static void nvmet_tcp_free_cmd_data_in_buffers(struct nvmet_tcp_queue *queue) +{ + struct nvmet_tcp_cmd *cmd = queue->cmds; + int i; + + for (i = 0; i < queue->nr_cmds; i++, cmd++) { + if (nvmet_tcp_need_data_in(cmd)) + nvmet_tcp_free_cmd_buffers(cmd); } + + if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect)) + nvmet_tcp_free_cmd_buffers(&queue->connect); } static void nvmet_tcp_release_queue_work(struct work_struct *w) @@ -1452,6 +1454,7 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w) nvmet_tcp_uninit_data_in_cmds(queue); nvmet_sq_destroy(&queue->nvme_sq); cancel_work_sync(&queue->io_work); + nvmet_tcp_free_cmd_data_in_buffers(queue); sock_release(queue->sock); nvmet_tcp_free_cmds(queue); if (queue->hdr_digest || queue->data_digest) diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c index 835bfda86fcf..1254cf57e008 100644 --- a/drivers/nvme/target/zns.c +++ b/drivers/nvme/target/zns.c @@ -400,7 +400,6 @@ static u16 nvmet_bdev_zone_mgmt_emulate_all(struct nvmet_req *req) { struct block_device *bdev = req->ns->bdev; unsigned int nr_zones = bdev_nr_zones(bdev); - struct request_queue *q = bdev_get_queue(bdev); struct bio *bio = NULL; sector_t sector = 0; int ret; @@ -409,7 +408,7 @@ static u16 nvmet_bdev_zone_mgmt_emulate_all(struct nvmet_req *req) }; d.zbitmap = kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(*(d.zbitmap)), - GFP_NOIO, q->node); + GFP_NOIO, bdev->bd_disk->node_id); if (!d.zbitmap) { ret = -ENOMEM; goto out; diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index ea82821599f6..5a6d9c15395f 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -41,15 +41,6 @@ #define DASD_DIAG_MOD "dasd_diag_mod" -static unsigned int queue_depth = 32; -static unsigned int nr_hw_queues = 4; - -module_param(queue_depth, uint, 0444); -MODULE_PARM_DESC(queue_depth, "Default queue depth for new DASD devices"); - -module_param(nr_hw_queues, uint, 0444); -MODULE_PARM_DESC(nr_hw_queues, "Default number of hardware queues for new DASD devices"); - /* * SECTION: exported variables of dasd.c */ @@ -68,8 +59,6 @@ MODULE_LICENSE("GPL"); /* * SECTION: prototypes for static functions of dasd.c */ -static int dasd_alloc_queue(struct dasd_block *); -static void dasd_free_queue(struct dasd_block *); static int dasd_flush_block_queue(struct dasd_block *); static void dasd_device_tasklet(unsigned long); static void dasd_block_tasklet(unsigned long); @@ -198,21 +187,11 @@ EXPORT_SYMBOL_GPL(dasd_free_block); */ static int dasd_state_new_to_known(struct dasd_device *device) { - int rc; - /* * As long as the device is not in state DASD_STATE_NEW we want to * keep the reference count > 0. */ dasd_get_device(device); - - if (device->block) { - rc = dasd_alloc_queue(device->block); - if (rc) { - dasd_put_device(device); - return rc; - } - } device->state = DASD_STATE_KNOWN; return 0; } @@ -226,9 +205,6 @@ static int dasd_state_known_to_new(struct dasd_device *device) dasd_eer_disable(device); device->state = DASD_STATE_NEW; - if (device->block) - dasd_free_queue(device->block); - /* Give up reference we took in dasd_state_new_to_known. */ dasd_put_device(device); return 0; @@ -1591,9 +1567,8 @@ void dasd_generic_handle_state_change(struct dasd_device *device) dasd_schedule_device_bh(device); if (device->block) { dasd_schedule_block_bh(device->block); - if (device->block->request_queue) - blk_mq_run_hw_queues(device->block->request_queue, - true); + if (device->block->gdp) + blk_mq_run_hw_queues(device->block->gdp->queue, true); } } EXPORT_SYMBOL_GPL(dasd_generic_handle_state_change); @@ -2691,7 +2666,7 @@ static void dasd_block_timeout(struct timer_list *t) dasd_device_remove_stop_bits(block->base, DASD_STOPPED_PENDING); spin_unlock_irqrestore(get_ccwdev_lock(block->base->cdev), flags); dasd_schedule_block_bh(block); - blk_mq_run_hw_queues(block->request_queue, true); + blk_mq_run_hw_queues(block->gdp->queue, true); } /* @@ -3239,7 +3214,7 @@ static void dasd_request_done(struct request *req) blk_mq_run_hw_queues(req->q, true); } -static struct blk_mq_ops dasd_mq_ops = { +struct blk_mq_ops dasd_mq_ops = { .queue_rq = do_dasd_request, .complete = dasd_request_done, .timeout = dasd_times_out, @@ -3247,45 +3222,6 @@ static struct blk_mq_ops dasd_mq_ops = { .exit_hctx = dasd_exit_hctx, }; -/* - * Allocate and initialize request queue and default I/O scheduler. - */ -static int dasd_alloc_queue(struct dasd_block *block) -{ - int rc; - - block->tag_set.ops = &dasd_mq_ops; - block->tag_set.cmd_size = sizeof(struct dasd_ccw_req); - block->tag_set.nr_hw_queues = nr_hw_queues; - block->tag_set.queue_depth = queue_depth; - block->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; - block->tag_set.numa_node = NUMA_NO_NODE; - - rc = blk_mq_alloc_tag_set(&block->tag_set); - if (rc) - return rc; - - block->request_queue = blk_mq_init_queue(&block->tag_set); - if (IS_ERR(block->request_queue)) - return PTR_ERR(block->request_queue); - - block->request_queue->queuedata = block; - - return 0; -} - -/* - * Deactivate and free request queue. - */ -static void dasd_free_queue(struct dasd_block *block) -{ - if (block->request_queue) { - blk_mq_destroy_queue(block->request_queue); - blk_mq_free_tag_set(&block->tag_set); - block->request_queue = NULL; - } -} - static int dasd_open(struct block_device *bdev, fmode_t mode) { struct dasd_device *base; @@ -3762,10 +3698,9 @@ int dasd_generic_path_operational(struct dasd_device *device) dasd_schedule_device_bh(device); if (device->block) { dasd_schedule_block_bh(device->block); - if (device->block->request_queue) - blk_mq_run_hw_queues(device->block->request_queue, - true); - } + if (device->block->gdp) + blk_mq_run_hw_queues(device->block->gdp->queue, true); + } if (!device->stopped) wake_up(&generic_waitq); @@ -3916,8 +3851,8 @@ void dasd_generic_space_avail(struct dasd_device *device) if (device->block) { dasd_schedule_block_bh(device->block); - if (device->block->request_queue) - blk_mq_run_hw_queues(device->block->request_queue, true); + if (device->block->gdp) + blk_mq_run_hw_queues(device->block->gdp->queue, true); } if (!device->stopped) wake_up(&generic_waitq); @@ -3927,7 +3862,7 @@ EXPORT_SYMBOL_GPL(dasd_generic_space_avail); /* * clear active requests and requeue them to block layer if possible */ -static int dasd_generic_requeue_all_requests(struct dasd_device *device) +int dasd_generic_requeue_all_requests(struct dasd_device *device) { struct list_head requeue_queue; struct dasd_ccw_req *cqr, *n; @@ -4001,6 +3936,7 @@ static int dasd_generic_requeue_all_requests(struct dasd_device *device) dasd_schedule_device_bh(device); return rc; } +EXPORT_SYMBOL_GPL(dasd_generic_requeue_all_requests); static void do_requeue_requests(struct work_struct *work) { diff --git a/drivers/s390/block/dasd_3990_erp.c b/drivers/s390/block/dasd_3990_erp.c index 299001ad9a32..81d283b3cd3b 100644 --- a/drivers/s390/block/dasd_3990_erp.c +++ b/drivers/s390/block/dasd_3990_erp.c @@ -1050,6 +1050,11 @@ dasd_3990_erp_com_rej(struct dasd_ccw_req * erp, char *sense) dev_err(&device->cdev->dev, "An I/O request was rejected" " because writing is inhibited\n"); erp = dasd_3990_erp_cleanup(erp, DASD_CQR_FAILED); + } else if (sense[7] & SNS7_INVALID_ON_SEC) { + dev_err(&device->cdev->dev, "An I/O request was rejected on a copy pair secondary device\n"); + /* suppress dump of sense data for this error */ + set_bit(DASD_CQR_SUPPRESS_CR, &erp->refers->flags); + erp = dasd_3990_erp_cleanup(erp, DASD_CQR_FAILED); } else { /* fatal error - set status to FAILED internal error 09 - Command Reject */ diff --git a/drivers/s390/block/dasd_devmap.c b/drivers/s390/block/dasd_devmap.c index 811e79c9f59c..1beb596d1434 100644 --- a/drivers/s390/block/dasd_devmap.c +++ b/drivers/s390/block/dasd_devmap.c @@ -26,7 +26,6 @@ /* This is ugly... */ #define PRINTK_HEADER "dasd_devmap:" -#define DASD_BUS_ID_SIZE 20 #define DASD_MAX_PARAMS 256 #include "dasd_int.h" @@ -50,6 +49,7 @@ struct dasd_devmap { unsigned int devindex; unsigned short features; struct dasd_device *device; + struct dasd_copy_relation *copy; }; /* @@ -130,7 +130,7 @@ __setup ("dasd=", dasd_call_setup); /* * Read a device busid/devno from a string. */ -static int __init dasd_busid(char *str, int *id0, int *id1, int *devno) +static int dasd_busid(char *str, int *id0, int *id1, int *devno) { unsigned int val; char *tok; @@ -438,16 +438,12 @@ dasd_add_busid(const char *bus_id, int features) return devmap; } -/* - * Find devmap for device with given bus_id. - */ static struct dasd_devmap * -dasd_find_busid(const char *bus_id) +dasd_find_busid_locked(const char *bus_id) { struct dasd_devmap *devmap, *tmp; int hash; - spin_lock(&dasd_devmap_lock); devmap = ERR_PTR(-ENODEV); hash = dasd_hash_busid(bus_id); list_for_each_entry(tmp, &dasd_hashlists[hash], list) { @@ -456,6 +452,19 @@ dasd_find_busid(const char *bus_id) break; } } + return devmap; +} + +/* + * Find devmap for device with given bus_id. + */ +static struct dasd_devmap * +dasd_find_busid(const char *bus_id) +{ + struct dasd_devmap *devmap; + + spin_lock(&dasd_devmap_lock); + devmap = dasd_find_busid_locked(bus_id); spin_unlock(&dasd_devmap_lock); return devmap; } @@ -585,6 +594,238 @@ dasd_create_device(struct ccw_device *cdev) } /* + * allocate a PPRC data structure and call the discipline function to fill + */ +static int dasd_devmap_get_pprc_status(struct dasd_device *device, + struct dasd_pprc_data_sc4 **data) +{ + struct dasd_pprc_data_sc4 *temp; + + if (!device->discipline || !device->discipline->pprc_status) { + dev_warn(&device->cdev->dev, "Unable to query copy relation status\n"); + return -EOPNOTSUPP; + } + temp = kzalloc(sizeof(*temp), GFP_KERNEL); + if (!temp) + return -ENOMEM; + + /* get PPRC information from storage */ + if (device->discipline->pprc_status(device, temp)) { + dev_warn(&device->cdev->dev, "Error during copy relation status query\n"); + kfree(temp); + return -EINVAL; + } + *data = temp; + + return 0; +} + +/* + * find an entry in a PPRC device_info array by a given UID + * depending on the primary/secondary state of the device it has to be + * matched with the respective fields + */ +static int dasd_devmap_entry_from_pprc_data(struct dasd_pprc_data_sc4 *data, + struct dasd_uid uid, + bool primary) +{ + int i; + + for (i = 0; i < DASD_CP_ENTRIES; i++) { + if (primary) { + if (data->dev_info[i].prim_cu_ssid == uid.ssid && + data->dev_info[i].primary == uid.real_unit_addr) + return i; + } else { + if (data->dev_info[i].sec_cu_ssid == uid.ssid && + data->dev_info[i].secondary == uid.real_unit_addr) + return i; + } + } + return -1; +} + +/* + * check the consistency of a specified copy relation by checking + * the following things: + * + * - is the given device part of a copy pair setup + * - does the state of the device match the state in the PPRC status data + * - does the device UID match with the UID in the PPRC status data + * - to prevent misrouted IO check if the given device is present in all + * related PPRC status data + */ +static int dasd_devmap_check_copy_relation(struct dasd_device *device, + struct dasd_copy_entry *entry, + struct dasd_pprc_data_sc4 *data, + struct dasd_copy_relation *copy) +{ + struct dasd_pprc_data_sc4 *tmp_dat; + struct dasd_device *tmp_dev; + struct dasd_uid uid; + int i, j; + + if (!device->discipline || !device->discipline->get_uid || + device->discipline->get_uid(device, &uid)) + return 1; + + i = dasd_devmap_entry_from_pprc_data(data, uid, entry->primary); + if (i < 0) { + dev_warn(&device->cdev->dev, "Device not part of a copy relation\n"); + return 1; + } + + /* double check which role the current device has */ + if (entry->primary) { + if (data->dev_info[i].flags & 0x80) { + dev_warn(&device->cdev->dev, "Copy pair secondary is setup as primary\n"); + return 1; + } + if (data->dev_info[i].prim_cu_ssid != uid.ssid || + data->dev_info[i].primary != uid.real_unit_addr) { + dev_warn(&device->cdev->dev, + "Primary device %s does not match copy pair status primary device %04x\n", + dev_name(&device->cdev->dev), + data->dev_info[i].prim_cu_ssid | + data->dev_info[i].primary); + return 1; + } + } else { + if (!(data->dev_info[i].flags & 0x80)) { + dev_warn(&device->cdev->dev, "Copy pair primary is setup as secondary\n"); + return 1; + } + if (data->dev_info[i].sec_cu_ssid != uid.ssid || + data->dev_info[i].secondary != uid.real_unit_addr) { + dev_warn(&device->cdev->dev, + "Secondary device %s does not match copy pair status secondary device %04x\n", + dev_name(&device->cdev->dev), + data->dev_info[i].sec_cu_ssid | + data->dev_info[i].secondary); + return 1; + } + } + + /* + * the current device has to be part of the copy relation of all + * entries to prevent misrouted IO to another copy pair + */ + for (j = 0; j < DASD_CP_ENTRIES; j++) { + if (entry == ©->entry[j]) + tmp_dev = device; + else + tmp_dev = copy->entry[j].device; + + if (!tmp_dev) + continue; + + if (dasd_devmap_get_pprc_status(tmp_dev, &tmp_dat)) + return 1; + + if (dasd_devmap_entry_from_pprc_data(tmp_dat, uid, entry->primary) < 0) { + dev_warn(&tmp_dev->cdev->dev, + "Copy pair relation does not contain device: %s\n", + dev_name(&device->cdev->dev)); + kfree(tmp_dat); + return 1; + } + kfree(tmp_dat); + } + return 0; +} + +/* delete device from copy relation entry */ +static void dasd_devmap_delete_copy_relation_device(struct dasd_device *device) +{ + struct dasd_copy_relation *copy; + int i; + + if (!device->copy) + return; + + copy = device->copy; + for (i = 0; i < DASD_CP_ENTRIES; i++) { + if (copy->entry[i].device == device) + copy->entry[i].device = NULL; + } + dasd_put_device(device); + device->copy = NULL; +} + +/* + * read all required information for a copy relation setup and setup the device + * accordingly + */ +int dasd_devmap_set_device_copy_relation(struct ccw_device *cdev, + bool pprc_enabled) +{ + struct dasd_pprc_data_sc4 *data = NULL; + struct dasd_copy_entry *entry = NULL; + struct dasd_copy_relation *copy; + struct dasd_devmap *devmap; + struct dasd_device *device; + int i, rc = 0; + + devmap = dasd_devmap_from_cdev(cdev); + if (IS_ERR(devmap)) + return PTR_ERR(devmap); + + device = devmap->device; + if (!device) + return -ENODEV; + + copy = devmap->copy; + /* no copy pair setup for this device */ + if (!copy) + goto out; + + rc = dasd_devmap_get_pprc_status(device, &data); + if (rc) + return rc; + + /* print error if PPRC is requested but not enabled on storage server */ + if (!pprc_enabled) { + dev_err(&cdev->dev, "Copy relation not enabled on storage server\n"); + rc = -EINVAL; + goto out; + } + + if (!data->dev_info[0].state) { + dev_warn(&device->cdev->dev, "Copy pair setup requested for device not in copy relation\n"); + rc = -EINVAL; + goto out; + } + /* find entry */ + for (i = 0; i < DASD_CP_ENTRIES; i++) { + if (copy->entry[i].configured && + strncmp(dev_name(&cdev->dev), + copy->entry[i].busid, DASD_BUS_ID_SIZE) == 0) { + entry = ©->entry[i]; + break; + } + } + if (!entry) { + dev_warn(&device->cdev->dev, "Copy relation entry not found\n"); + rc = -EINVAL; + goto out; + } + /* check if the copy relation is valid */ + if (dasd_devmap_check_copy_relation(device, entry, data, copy)) { + dev_warn(&device->cdev->dev, "Copy relation faulty\n"); + rc = -EINVAL; + goto out; + } + + dasd_get_device(device); + copy->entry[i].device = device; + device->copy = copy; +out: + kfree(data); + return rc; +} +EXPORT_SYMBOL_GPL(dasd_devmap_set_device_copy_relation); + +/* * Wait queue for dasd_delete_device waits. */ static DECLARE_WAIT_QUEUE_HEAD(dasd_delete_wq); @@ -617,6 +858,8 @@ dasd_delete_device(struct dasd_device *device) dev_set_drvdata(&device->cdev->dev, NULL); spin_unlock_irqrestore(get_ccwdev_lock(device->cdev), flags); + /* Removve copy relation */ + dasd_devmap_delete_copy_relation_device(device); /* * Drop ref_count by 3, one for the devmap reference, one for * the cdev reference and one for the passed reference. @@ -694,6 +937,7 @@ void dasd_add_link_to_gendisk(struct gendisk *gdp, struct dasd_device *device) gdp->private_data = devmap; spin_unlock(&dasd_devmap_lock); } +EXPORT_SYMBOL(dasd_add_link_to_gendisk); struct dasd_device *dasd_device_from_gendisk(struct gendisk *gdp) { @@ -1334,7 +1578,6 @@ dasd_timeout_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct dasd_device *device; - struct request_queue *q; unsigned long val; device = dasd_device_from_cdev(to_ccwdev(dev)); @@ -1346,15 +1589,13 @@ dasd_timeout_store(struct device *dev, struct device_attribute *attr, dasd_put_device(device); return -EINVAL; } - q = device->block->request_queue; - if (!q) { + if (!device->block->gdp) { dasd_put_device(device); return -ENODEV; } device->blk_timeout = val; - - blk_queue_rq_timeout(q, device->blk_timeout * HZ); + blk_queue_rq_timeout(device->block->gdp->queue, val * HZ); dasd_put_device(device); return count; @@ -1683,6 +1924,347 @@ dasd_path_fcs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) static struct kobj_attribute path_fcs_attribute = __ATTR(fc_security, 0444, dasd_path_fcs_show, NULL); +/* + * print copy relation in the form + * primary,secondary[1] primary,secondary[2], ... + */ +static ssize_t +dasd_copy_pair_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + char prim_busid[DASD_BUS_ID_SIZE]; + struct dasd_copy_relation *copy; + struct dasd_devmap *devmap; + int len = 0; + int i; + + devmap = dasd_find_busid(dev_name(dev)); + if (IS_ERR(devmap)) + return -ENODEV; + + if (!devmap->copy) + return -ENODEV; + + copy = devmap->copy; + /* find primary */ + for (i = 0; i < DASD_CP_ENTRIES; i++) { + if (copy->entry[i].configured && copy->entry[i].primary) { + strscpy(prim_busid, copy->entry[i].busid, + DASD_BUS_ID_SIZE); + break; + } + } + if (!copy->entry[i].primary) + goto out; + + /* print all secondary */ + for (i = 0; i < DASD_CP_ENTRIES; i++) { + if (copy->entry[i].configured && !copy->entry[i].primary) + len += sysfs_emit_at(buf, len, "%s,%s ", prim_busid, + copy->entry[i].busid); + } + + len += sysfs_emit_at(buf, len, "\n"); +out: + return len; +} + +static int dasd_devmap_set_copy_relation(struct dasd_devmap *devmap, + struct dasd_copy_relation *copy, + char *busid, bool primary) +{ + int i; + + /* find free entry */ + for (i = 0; i < DASD_CP_ENTRIES; i++) { + /* current bus_id already included, nothing to do */ + if (copy->entry[i].configured && + strncmp(copy->entry[i].busid, busid, DASD_BUS_ID_SIZE) == 0) + return 0; + + if (!copy->entry[i].configured) + break; + } + if (i == DASD_CP_ENTRIES) + return -EINVAL; + + copy->entry[i].configured = true; + strscpy(copy->entry[i].busid, busid, DASD_BUS_ID_SIZE); + if (primary) { + copy->active = ©->entry[i]; + copy->entry[i].primary = true; + } + if (!devmap->copy) + devmap->copy = copy; + + return 0; +} + +static void dasd_devmap_del_copy_relation(struct dasd_copy_relation *copy, + char *busid) +{ + int i; + + spin_lock(&dasd_devmap_lock); + /* find entry */ + for (i = 0; i < DASD_CP_ENTRIES; i++) { + if (copy->entry[i].configured && + strncmp(copy->entry[i].busid, busid, DASD_BUS_ID_SIZE) == 0) + break; + } + if (i == DASD_CP_ENTRIES || !copy->entry[i].configured) { + spin_unlock(&dasd_devmap_lock); + return; + } + + copy->entry[i].configured = false; + memset(copy->entry[i].busid, 0, DASD_BUS_ID_SIZE); + if (copy->active == ©->entry[i]) { + copy->active = NULL; + copy->entry[i].primary = false; + } + spin_unlock(&dasd_devmap_lock); +} + +static int dasd_devmap_clear_copy_relation(struct device *dev) +{ + struct dasd_copy_relation *copy; + struct dasd_devmap *devmap; + int i, rc = 1; + + devmap = dasd_devmap_from_cdev(to_ccwdev(dev)); + if (IS_ERR(devmap)) + return 1; + + spin_lock(&dasd_devmap_lock); + if (!devmap->copy) + goto out; + + copy = devmap->copy; + /* first check if all secondary devices are offline*/ + for (i = 0; i < DASD_CP_ENTRIES; i++) { + if (!copy->entry[i].configured) + continue; + + if (copy->entry[i].device == copy->active->device) + continue; + + if (copy->entry[i].device) + goto out; + } + /* clear all devmap entries */ + for (i = 0; i < DASD_CP_ENTRIES; i++) { + if (strlen(copy->entry[i].busid) == 0) + continue; + if (copy->entry[i].device) { + dasd_put_device(copy->entry[i].device); + copy->entry[i].device->copy = NULL; + copy->entry[i].device = NULL; + } + devmap = dasd_find_busid_locked(copy->entry[i].busid); + devmap->copy = NULL; + memset(copy->entry[i].busid, 0, DASD_BUS_ID_SIZE); + } + kfree(copy); + rc = 0; +out: + spin_unlock(&dasd_devmap_lock); + return rc; +} + +/* + * parse BUSIDs from a copy pair + */ +static int dasd_devmap_parse_busid(const char *buf, char *prim_busid, + char *sec_busid) +{ + char *primary, *secondary, *tmp, *pt; + int id0, id1, id2; + + pt = kstrdup(buf, GFP_KERNEL); + tmp = pt; + if (!tmp) + return -ENOMEM; + + primary = strsep(&tmp, ","); + if (!primary) { + kfree(pt); + return -EINVAL; + } + secondary = strsep(&tmp, ","); + if (!secondary) { + kfree(pt); + return -EINVAL; + } + if (dasd_busid(primary, &id0, &id1, &id2)) { + kfree(pt); + return -EINVAL; + } + sprintf(prim_busid, "%01x.%01x.%04x", id0, id1, id2); + if (dasd_busid(secondary, &id0, &id1, &id2)) { + kfree(pt); + return -EINVAL; + } + sprintf(sec_busid, "%01x.%01x.%04x", id0, id1, id2); + kfree(pt); + + return 0; +} + +static ssize_t dasd_copy_pair_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct dasd_devmap *prim_devmap, *sec_devmap; + char prim_busid[DASD_BUS_ID_SIZE]; + char sec_busid[DASD_BUS_ID_SIZE]; + struct dasd_copy_relation *copy; + struct dasd_device *device; + bool pprc_enabled; + int rc; + + if (strncmp(buf, "clear", strlen("clear")) == 0) { + if (dasd_devmap_clear_copy_relation(dev)) + return -EINVAL; + return count; + } + + rc = dasd_devmap_parse_busid(buf, prim_busid, sec_busid); + if (rc) + return rc; + + if (strncmp(dev_name(dev), prim_busid, DASD_BUS_ID_SIZE) != 0 && + strncmp(dev_name(dev), sec_busid, DASD_BUS_ID_SIZE) != 0) + return -EINVAL; + + /* allocate primary devmap if needed */ + prim_devmap = dasd_find_busid(prim_busid); + if (IS_ERR(prim_devmap)) + prim_devmap = dasd_add_busid(prim_busid, DASD_FEATURE_DEFAULT); + + /* allocate secondary devmap if needed */ + sec_devmap = dasd_find_busid(sec_busid); + if (IS_ERR(sec_devmap)) + sec_devmap = dasd_add_busid(sec_busid, DASD_FEATURE_DEFAULT); + + /* setting copy relation is only allowed for offline secondary */ + if (sec_devmap->device) + return -EINVAL; + + if (prim_devmap->copy) { + copy = prim_devmap->copy; + } else if (sec_devmap->copy) { + copy = sec_devmap->copy; + } else { + copy = kzalloc(sizeof(*copy), GFP_KERNEL); + if (!copy) + return -ENOMEM; + } + spin_lock(&dasd_devmap_lock); + rc = dasd_devmap_set_copy_relation(prim_devmap, copy, prim_busid, true); + if (rc) { + spin_unlock(&dasd_devmap_lock); + return rc; + } + rc = dasd_devmap_set_copy_relation(sec_devmap, copy, sec_busid, false); + if (rc) { + spin_unlock(&dasd_devmap_lock); + return rc; + } + spin_unlock(&dasd_devmap_lock); + + /* if primary device is already online call device setup directly */ + if (prim_devmap->device && !prim_devmap->device->copy) { + device = prim_devmap->device; + if (device->discipline->pprc_enabled) { + pprc_enabled = device->discipline->pprc_enabled(device); + rc = dasd_devmap_set_device_copy_relation(device->cdev, + pprc_enabled); + } else { + rc = -EOPNOTSUPP; + } + } + if (rc) { + dasd_devmap_del_copy_relation(copy, prim_busid); + dasd_devmap_del_copy_relation(copy, sec_busid); + count = rc; + } + + return count; +} +static DEVICE_ATTR(copy_pair, 0644, dasd_copy_pair_show, + dasd_copy_pair_store); + +static ssize_t +dasd_copy_role_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dasd_copy_relation *copy; + struct dasd_device *device; + int len, i; + + device = dasd_device_from_cdev(to_ccwdev(dev)); + if (IS_ERR(device)) + return -ENODEV; + + if (!device->copy) { + len = sysfs_emit(buf, "none\n"); + goto out; + } + copy = device->copy; + /* only the active device is primary */ + if (copy->active->device == device) { + len = sysfs_emit(buf, "primary\n"); + goto out; + } + for (i = 0; i < DASD_CP_ENTRIES; i++) { + if (copy->entry[i].device == device) { + len = sysfs_emit(buf, "secondary\n"); + goto out; + } + } + /* not in the list, no COPY role */ + len = sysfs_emit(buf, "none\n"); +out: + dasd_put_device(device); + return len; +} +static DEVICE_ATTR(copy_role, 0444, dasd_copy_role_show, NULL); + +static ssize_t dasd_device_ping(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct dasd_device *device; + size_t rc; + + device = dasd_device_from_cdev(to_ccwdev(dev)); + if (IS_ERR(device)) + return -ENODEV; + + /* + * do not try during offline processing + * early check only + * the sleep_on function itself checks for offline + * processing again + */ + if (test_bit(DASD_FLAG_OFFLINE, &device->flags)) { + rc = -EBUSY; + goto out; + } + if (!device->discipline || !device->discipline->device_ping) { + rc = -EOPNOTSUPP; + goto out; + } + rc = device->discipline->device_ping(device); + if (!rc) + rc = count; +out: + dasd_put_device(device); + return rc; +} +static DEVICE_ATTR(ping, 0200, NULL, dasd_device_ping); + #define DASD_DEFINE_ATTR(_name, _func) \ static ssize_t dasd_##_name##_show(struct device *dev, \ struct device_attribute *attr, \ @@ -1739,6 +2321,9 @@ static struct attribute * dasd_attrs[] = { &dev_attr_hpf.attr, &dev_attr_ese.attr, &dev_attr_fc_security.attr, + &dev_attr_copy_pair.attr, + &dev_attr_copy_role.attr, + &dev_attr_ping.attr, NULL, }; diff --git a/drivers/s390/block/dasd_diag.c b/drivers/s390/block/dasd_diag.c index 94ee59864971..f956a4ac9881 100644 --- a/drivers/s390/block/dasd_diag.c +++ b/drivers/s390/block/dasd_diag.c @@ -627,7 +627,7 @@ dasd_diag_dump_sense(struct dasd_device *device, struct dasd_ccw_req * req, static void dasd_diag_setup_blk_queue(struct dasd_block *block) { unsigned int logical_block_size = block->bp_block; - struct request_queue *q = block->request_queue; + struct request_queue *q = block->gdp->queue; int max; max = DIAG_MAX_BLOCKS << block->s2b_shift; diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index 3cc93e2e4e15..662730f3b027 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -2013,6 +2013,49 @@ static void dasd_eckd_kick_validate_server(struct dasd_device *device) } /* + * return if the device is the copy relation primary if a copy relation is active + */ +static int dasd_device_is_primary(struct dasd_device *device) +{ + if (!device->copy) + return 1; + + if (device->copy->active->device == device) + return 1; + + return 0; +} + +static int dasd_eckd_alloc_block(struct dasd_device *device) +{ + struct dasd_block *block; + struct dasd_uid temp_uid; + + if (!dasd_device_is_primary(device)) + return 0; + + dasd_eckd_get_uid(device, &temp_uid); + if (temp_uid.type == UA_BASE_DEVICE) { + block = dasd_alloc_block(); + if (IS_ERR(block)) { + DBF_EVENT_DEVID(DBF_WARNING, device->cdev, "%s", + "could not allocate dasd block structure"); + return PTR_ERR(block); + } + device->block = block; + block->base = device; + } + return 0; +} + +static bool dasd_eckd_pprc_enabled(struct dasd_device *device) +{ + struct dasd_eckd_private *private = device->private; + + return private->rdc_data.facilities.PPRC_enabled; +} + +/* * Check device characteristics. * If the device is accessible using ECKD discipline, the device is enabled. */ @@ -2020,8 +2063,6 @@ static int dasd_eckd_check_characteristics(struct dasd_device *device) { struct dasd_eckd_private *private = device->private; - struct dasd_block *block; - struct dasd_uid temp_uid; int rc, i; int readonly; unsigned long value; @@ -2079,20 +2120,29 @@ dasd_eckd_check_characteristics(struct dasd_device *device) device->default_expires = value; } - dasd_eckd_get_uid(device, &temp_uid); - if (temp_uid.type == UA_BASE_DEVICE) { - block = dasd_alloc_block(); - if (IS_ERR(block)) { - DBF_EVENT_DEVID(DBF_WARNING, device->cdev, "%s", - "could not allocate dasd " - "block structure"); - rc = PTR_ERR(block); - goto out_err1; - } - device->block = block; - block->base = device; + /* Read Device Characteristics */ + rc = dasd_generic_read_dev_chars(device, DASD_ECKD_MAGIC, + &private->rdc_data, 64); + if (rc) { + DBF_EVENT_DEVID(DBF_WARNING, device->cdev, + "Read device characteristic failed, rc=%d", rc); + goto out_err1; + } + + /* setup PPRC for device from devmap */ + rc = dasd_devmap_set_device_copy_relation(device->cdev, + dasd_eckd_pprc_enabled(device)); + if (rc) { + DBF_EVENT_DEVID(DBF_WARNING, device->cdev, + "copy relation setup failed, rc=%d", rc); + goto out_err1; } + /* check if block device is needed and allocate in case */ + rc = dasd_eckd_alloc_block(device); + if (rc) + goto out_err1; + /* register lcu with alias handling, enable PAV */ rc = dasd_alias_make_device_known_to_lcu(device); if (rc) @@ -2117,15 +2167,6 @@ dasd_eckd_check_characteristics(struct dasd_device *device) /* Read Extent Pool Information */ dasd_eckd_read_ext_pool_info(device); - /* Read Device Characteristics */ - rc = dasd_generic_read_dev_chars(device, DASD_ECKD_MAGIC, - &private->rdc_data, 64); - if (rc) { - DBF_EVENT_DEVID(DBF_WARNING, device->cdev, - "Read device characteristic failed, rc=%d", rc); - goto out_err3; - } - if ((device->features & DASD_FEATURE_USERAW) && !(private->rdc_data.facilities.RT_in_LR)) { dev_err(&device->cdev->dev, "The storage server does not " @@ -6078,6 +6119,207 @@ static int dasd_hosts_print(struct dasd_device *device, struct seq_file *m) return 0; } +static struct dasd_device +*copy_relation_find_device(struct dasd_copy_relation *copy, + char *busid) +{ + int i; + + for (i = 0; i < DASD_CP_ENTRIES; i++) { + if (copy->entry[i].configured && + strncmp(copy->entry[i].busid, busid, DASD_BUS_ID_SIZE) == 0) + return copy->entry[i].device; + } + return NULL; +} + +/* + * set the new active/primary device + */ +static void copy_pair_set_active(struct dasd_copy_relation *copy, char *new_busid, + char *old_busid) +{ + int i; + + for (i = 0; i < DASD_CP_ENTRIES; i++) { + if (copy->entry[i].configured && + strncmp(copy->entry[i].busid, new_busid, + DASD_BUS_ID_SIZE) == 0) { + copy->active = ©->entry[i]; + copy->entry[i].primary = true; + } else if (copy->entry[i].configured && + strncmp(copy->entry[i].busid, old_busid, + DASD_BUS_ID_SIZE) == 0) { + copy->entry[i].primary = false; + } + } +} + +/* + * The function will swap the role of a given copy pair. + * During the swap operation the relation of the blockdevice is disconnected + * from the old primary and connected to the new. + * + * IO is paused on the block queue before swap and may be resumed afterwards. + */ +static int dasd_eckd_copy_pair_swap(struct dasd_device *device, char *prim_busid, + char *sec_busid) +{ + struct dasd_device *primary, *secondary; + struct dasd_copy_relation *copy; + struct dasd_block *block; + struct gendisk *gdp; + + copy = device->copy; + if (!copy) + return DASD_COPYPAIRSWAP_INVALID; + primary = copy->active->device; + if (!primary) + return DASD_COPYPAIRSWAP_INVALID; + /* double check if swap has correct primary */ + if (strncmp(dev_name(&primary->cdev->dev), prim_busid, DASD_BUS_ID_SIZE) != 0) + return DASD_COPYPAIRSWAP_PRIMARY; + + secondary = copy_relation_find_device(copy, sec_busid); + if (!secondary) + return DASD_COPYPAIRSWAP_SECONDARY; + + /* + * usually the device should be quiesced for swap + * for paranoia stop device and requeue requests again + */ + dasd_device_set_stop_bits(primary, DASD_STOPPED_PPRC); + dasd_device_set_stop_bits(secondary, DASD_STOPPED_PPRC); + dasd_generic_requeue_all_requests(primary); + + /* swap DASD internal device <> block assignment */ + block = primary->block; + primary->block = NULL; + secondary->block = block; + block->base = secondary; + /* set new primary device in COPY relation */ + copy_pair_set_active(copy, sec_busid, prim_busid); + + /* swap blocklayer device link */ + gdp = block->gdp; + dasd_add_link_to_gendisk(gdp, secondary); + + /* re-enable device */ + dasd_device_remove_stop_bits(primary, DASD_STOPPED_PPRC); + dasd_device_remove_stop_bits(secondary, DASD_STOPPED_PPRC); + dasd_schedule_device_bh(secondary); + + return DASD_COPYPAIRSWAP_SUCCESS; +} + +/* + * Perform Subsystem Function - Peer-to-Peer Remote Copy Extended Query + */ +static int dasd_eckd_query_pprc_status(struct dasd_device *device, + struct dasd_pprc_data_sc4 *data) +{ + struct dasd_pprc_data_sc4 *pprc_data; + struct dasd_psf_prssd_data *prssdp; + struct dasd_ccw_req *cqr; + struct ccw1 *ccw; + int rc; + + cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 1 /* PSF */ + 1 /* RSSD */, + sizeof(*prssdp) + sizeof(*pprc_data) + 1, + device, NULL); + if (IS_ERR(cqr)) { + DBF_EVENT_DEVID(DBF_WARNING, device->cdev, "%s", + "Could not allocate query PPRC status request"); + return PTR_ERR(cqr); + } + cqr->startdev = device; + cqr->memdev = device; + cqr->block = NULL; + cqr->retries = 256; + cqr->expires = 10 * HZ; + + /* Prepare for Read Subsystem Data */ + prssdp = (struct dasd_psf_prssd_data *)cqr->data; + memset(prssdp, 0, sizeof(struct dasd_psf_prssd_data)); + prssdp->order = PSF_ORDER_PRSSD; + prssdp->suborder = PSF_SUBORDER_PPRCEQ; + prssdp->varies[0] = PPRCEQ_SCOPE_4; + pprc_data = (struct dasd_pprc_data_sc4 *)(prssdp + 1); + + ccw = cqr->cpaddr; + ccw->cmd_code = DASD_ECKD_CCW_PSF; + ccw->count = sizeof(struct dasd_psf_prssd_data); + ccw->flags |= CCW_FLAG_CC; + ccw->flags |= CCW_FLAG_SLI; + ccw->cda = (__u32)(addr_t)prssdp; + + /* Read Subsystem Data - query host access */ + ccw++; + ccw->cmd_code = DASD_ECKD_CCW_RSSD; + ccw->count = sizeof(*pprc_data); + ccw->flags |= CCW_FLAG_SLI; + ccw->cda = (__u32)(addr_t)pprc_data; + + cqr->buildclk = get_tod_clock(); + cqr->status = DASD_CQR_FILLED; + + rc = dasd_sleep_on_interruptible(cqr); + if (rc == 0) { + *data = *pprc_data; + } else { + DBF_EVENT_DEVID(DBF_WARNING, device->cdev, + "PPRC Extended Query failed with rc=%d\n", + rc); + rc = -EOPNOTSUPP; + } + + dasd_sfree_request(cqr, cqr->memdev); + return rc; +} + +/* + * ECKD NOP - no operation + */ +static int dasd_eckd_nop(struct dasd_device *device) +{ + struct dasd_ccw_req *cqr; + struct ccw1 *ccw; + int rc; + + cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 1, 1, device, NULL); + if (IS_ERR(cqr)) { + DBF_EVENT_DEVID(DBF_WARNING, device->cdev, "%s", + "Could not allocate NOP request"); + return PTR_ERR(cqr); + } + cqr->startdev = device; + cqr->memdev = device; + cqr->block = NULL; + cqr->retries = 1; + cqr->expires = 10 * HZ; + + ccw = cqr->cpaddr; + ccw->cmd_code = DASD_ECKD_CCW_NOP; + ccw->flags |= CCW_FLAG_SLI; + + cqr->buildclk = get_tod_clock(); + cqr->status = DASD_CQR_FILLED; + + rc = dasd_sleep_on_interruptible(cqr); + if (rc != 0) { + DBF_EVENT_DEVID(DBF_WARNING, device->cdev, + "NOP failed with rc=%d\n", rc); + rc = -EOPNOTSUPP; + } + dasd_sfree_request(cqr, cqr->memdev); + return rc; +} + +static int dasd_eckd_device_ping(struct dasd_device *device) +{ + return dasd_eckd_nop(device); +} + /* * Perform Subsystem Function - CUIR response */ @@ -6602,7 +6844,7 @@ static void dasd_eckd_handle_hpf_error(struct dasd_device *device, static void dasd_eckd_setup_blk_queue(struct dasd_block *block) { unsigned int logical_block_size = block->bp_block; - struct request_queue *q = block->request_queue; + struct request_queue *q = block->gdp->queue; struct dasd_device *device = block->base; int max; @@ -6697,6 +6939,10 @@ static struct dasd_discipline dasd_eckd_discipline = { .ext_pool_exhaust = dasd_eckd_ext_pool_exhaust, .ese_format = dasd_eckd_ese_format, .ese_read = dasd_eckd_ese_read, + .pprc_status = dasd_eckd_query_pprc_status, + .pprc_enabled = dasd_eckd_pprc_enabled, + .copy_pair_swap = dasd_eckd_copy_pair_swap, + .device_ping = dasd_eckd_device_ping, }; static int __init diff --git a/drivers/s390/block/dasd_eckd.h b/drivers/s390/block/dasd_eckd.h index a91b265441cc..f9299bd184ba 100644 --- a/drivers/s390/block/dasd_eckd.h +++ b/drivers/s390/block/dasd_eckd.h @@ -13,6 +13,7 @@ /***************************************************************************** * SECTION: CCW Definitions ****************************************************************************/ +#define DASD_ECKD_CCW_NOP 0x03 #define DASD_ECKD_CCW_WRITE 0x05 #define DASD_ECKD_CCW_READ 0x06 #define DASD_ECKD_CCW_WRITE_HOME_ADDRESS 0x09 @@ -66,10 +67,16 @@ * Perform Subsystem Function / Sub-Orders */ #define PSF_SUBORDER_QHA 0x1C /* Query Host Access */ +#define PSF_SUBORDER_PPRCEQ 0x50 /* PPRC Extended Query */ #define PSF_SUBORDER_VSQ 0x52 /* Volume Storage Query */ #define PSF_SUBORDER_LCQ 0x53 /* Logical Configuration Query */ /* + * PPRC Extended Query Scopes + */ +#define PPRCEQ_SCOPE_4 0x04 /* Scope 4 for PPRC Extended Query */ + +/* * CUIR response condition codes */ #define PSF_CUIR_INVALID 0x00 @@ -261,7 +268,7 @@ struct dasd_eckd_characteristics { unsigned char reserved3:8; unsigned char defect_wr:1; unsigned char XRC_supported:1; - unsigned char reserved4:1; + unsigned char PPRC_enabled:1; unsigned char striping:1; unsigned char reserved5:4; unsigned char cfw:1; diff --git a/drivers/s390/block/dasd_fba.c b/drivers/s390/block/dasd_fba.c index 60be7f7bf2d1..cddfb01a3dca 100644 --- a/drivers/s390/block/dasd_fba.c +++ b/drivers/s390/block/dasd_fba.c @@ -767,7 +767,7 @@ dasd_fba_dump_sense(struct dasd_device *device, struct dasd_ccw_req * req, static void dasd_fba_setup_blk_queue(struct dasd_block *block) { unsigned int logical_block_size = block->bp_block; - struct request_queue *q = block->request_queue; + struct request_queue *q = block->gdp->queue; unsigned int max_bytes, max_discard_sectors; int max; diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index 5a83f0a39901..998a961e1704 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -25,7 +25,14 @@ #include "dasd_int.h" -static struct lock_class_key dasd_bio_compl_lkclass; +static unsigned int queue_depth = 32; +static unsigned int nr_hw_queues = 4; + +module_param(queue_depth, uint, 0444); +MODULE_PARM_DESC(queue_depth, "Default queue depth for new DASD devices"); + +module_param(nr_hw_queues, uint, 0444); +MODULE_PARM_DESC(nr_hw_queues, "Default number of hardware queues for new DASD devices"); /* * Allocate and register gendisk structure for device. @@ -41,10 +48,21 @@ int dasd_gendisk_alloc(struct dasd_block *block) if (base->devindex >= DASD_PER_MAJOR) return -EBUSY; - gdp = blk_mq_alloc_disk_for_queue(block->request_queue, - &dasd_bio_compl_lkclass); - if (!gdp) - return -ENOMEM; + block->tag_set.ops = &dasd_mq_ops; + block->tag_set.cmd_size = sizeof(struct dasd_ccw_req); + block->tag_set.nr_hw_queues = nr_hw_queues; + block->tag_set.queue_depth = queue_depth; + block->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + block->tag_set.numa_node = NUMA_NO_NODE; + rc = blk_mq_alloc_tag_set(&block->tag_set); + if (rc) + return rc; + + gdp = blk_mq_alloc_disk(&block->tag_set, block); + if (IS_ERR(gdp)) { + blk_mq_free_tag_set(&block->tag_set); + return PTR_ERR(gdp); + } /* Initialize gendisk structure. */ gdp->major = DASD_MAJOR; @@ -100,6 +118,7 @@ void dasd_gendisk_free(struct dasd_block *block) block->gdp->private_data = NULL; put_disk(block->gdp); block->gdp = NULL; + blk_mq_free_tag_set(&block->tag_set); } } diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index 333a399f754e..97adc8a7ae6b 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -260,6 +260,55 @@ struct dasd_uid { }; /* + * PPRC Status data + */ +struct dasd_pprc_header { + __u8 entries; /* 0 Number of device entries */ + __u8 unused; /* 1 unused */ + __u16 entry_length; /* 2-3 Length of device entry */ + __u32 unused2; /* 4-7 unused */ +} __packed; + +struct dasd_pprc_dev_info { + __u8 state; /* 0 Copy State */ + __u8 flags; /* 1 Flags */ + __u8 reserved1[2]; /* 2-3 reserved */ + __u8 prim_lss; /* 4 Primary device LSS */ + __u8 primary; /* 5 Primary device address */ + __u8 sec_lss; /* 6 Secondary device LSS */ + __u8 secondary; /* 7 Secondary device address */ + __u16 pprc_id; /* 8-9 Peer-to-Peer Remote Copy ID */ + __u8 reserved2[12]; /* 10-21 reserved */ + __u16 prim_cu_ssid; /* 22-23 Pimary Control Unit SSID */ + __u8 reserved3[12]; /* 24-35 reserved */ + __u16 sec_cu_ssid; /* 36-37 Secondary Control Unit SSID */ + __u8 reserved4[90]; /* 38-127 reserved */ +} __packed; + +struct dasd_pprc_data_sc4 { + struct dasd_pprc_header header; + struct dasd_pprc_dev_info dev_info[5]; +} __packed; + +#define DASD_BUS_ID_SIZE 20 +#define DASD_CP_ENTRIES 5 + +struct dasd_copy_entry { + char busid[DASD_BUS_ID_SIZE]; + struct dasd_device *device; + bool primary; + bool configured; +}; + +struct dasd_copy_relation { + struct dasd_copy_entry entry[DASD_CP_ENTRIES]; + struct dasd_copy_entry *active; +}; + +int dasd_devmap_set_device_copy_relation(struct ccw_device *, + bool pprc_enabled); + +/* * the struct dasd_discipline is * sth like a table of virtual functions, if you think of dasd_eckd * inheriting dasd... @@ -387,6 +436,10 @@ struct dasd_discipline { struct dasd_ccw_req *(*ese_format)(struct dasd_device *, struct dasd_ccw_req *, struct irb *); int (*ese_read)(struct dasd_ccw_req *, struct irb *); + int (*pprc_status)(struct dasd_device *, struct dasd_pprc_data_sc4 *); + bool (*pprc_enabled)(struct dasd_device *); + int (*copy_pair_swap)(struct dasd_device *, char *, char *); + int (*device_ping)(struct dasd_device *); }; extern struct dasd_discipline *dasd_diag_discipline_pointer; @@ -583,12 +636,12 @@ struct dasd_device { struct dasd_profile profile; struct dasd_format_entry format_entry; struct kset *paths_info; + struct dasd_copy_relation *copy; }; struct dasd_block { /* Block device stuff. */ struct gendisk *gdp; - struct request_queue *request_queue; spinlock_t request_queue_lock; struct blk_mq_tag_set tag_set; struct block_device *bdev; @@ -629,6 +682,7 @@ struct dasd_queue { #define DASD_STOPPED_PENDING 4 /* long busy */ #define DASD_STOPPED_DC_WAIT 8 /* disconnected, wait */ #define DASD_STOPPED_SU 16 /* summary unit check handling */ +#define DASD_STOPPED_PPRC 32 /* PPRC swap */ #define DASD_STOPPED_NOSPC 128 /* no space left */ /* per device flags */ @@ -654,6 +708,22 @@ struct dasd_queue { void dasd_put_device_wake(struct dasd_device *); /* + * return values to be returned from the copy pair swap function + * 0x00: swap successful + * 0x01: swap data invalid + * 0x02: no active device found + * 0x03: wrong primary specified + * 0x04: secondary device not found + * 0x05: swap already running + */ +#define DASD_COPYPAIRSWAP_SUCCESS 0 +#define DASD_COPYPAIRSWAP_INVALID 1 +#define DASD_COPYPAIRSWAP_NOACTIVE 2 +#define DASD_COPYPAIRSWAP_PRIMARY 3 +#define DASD_COPYPAIRSWAP_SECONDARY 4 +#define DASD_COPYPAIRSWAP_MULTIPLE 5 + +/* * Reference count inliners */ static inline void @@ -779,6 +849,7 @@ extern debug_info_t *dasd_debug_area; extern struct dasd_profile dasd_global_profile; extern unsigned int dasd_global_profile_level; extern const struct block_device_operations dasd_device_operations; +extern struct blk_mq_ops dasd_mq_ops; extern struct kmem_cache *dasd_page_cache; @@ -837,6 +908,8 @@ int dasd_generic_verify_path(struct dasd_device *, __u8); void dasd_generic_space_exhaust(struct dasd_device *, struct dasd_ccw_req *); void dasd_generic_space_avail(struct dasd_device *); +int dasd_generic_requeue_all_requests(struct dasd_device *); + int dasd_generic_read_dev_chars(struct dasd_device *, int, void *, int); char *dasd_get_sense(struct irb *); diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c index 95349f95758c..d0ddf2cc9786 100644 --- a/drivers/s390/block/dasd_ioctl.c +++ b/drivers/s390/block/dasd_ioctl.c @@ -379,6 +379,56 @@ out_err: return rc; } +/* + * Swap driver iternal copy relation. + */ +static int +dasd_ioctl_copy_pair_swap(struct block_device *bdev, void __user *argp) +{ + struct dasd_copypair_swap_data_t data; + struct dasd_device *device; + int rc; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + device = dasd_device_from_gendisk(bdev->bd_disk); + if (!device) + return -ENODEV; + + if (copy_from_user(&data, argp, sizeof(struct dasd_copypair_swap_data_t))) { + dasd_put_device(device); + return -EFAULT; + } + if (memchr_inv(data.reserved, 0, sizeof(data.reserved))) { + pr_warn("%s: Ivalid swap data specified.\n", + dev_name(&device->cdev->dev)); + dasd_put_device(device); + return DASD_COPYPAIRSWAP_INVALID; + } + if (bdev_is_partition(bdev)) { + pr_warn("%s: The specified DASD is a partition and cannot be swapped\n", + dev_name(&device->cdev->dev)); + dasd_put_device(device); + return DASD_COPYPAIRSWAP_INVALID; + } + if (!device->copy) { + pr_warn("%s: The specified DASD has no copy pair set up\n", + dev_name(&device->cdev->dev)); + dasd_put_device(device); + return -ENODEV; + } + if (!device->discipline->copy_pair_swap) { + dasd_put_device(device); + return -EOPNOTSUPP; + } + rc = device->discipline->copy_pair_swap(device, data.primary, + data.secondary); + dasd_put_device(device); + + return rc; +} + #ifdef CONFIG_DASD_PROFILE /* * Reset device profile information @@ -637,6 +687,9 @@ int dasd_ioctl(struct block_device *bdev, fmode_t mode, case BIODASDRAS: rc = dasd_ioctl_release_space(bdev, argp); break; + case BIODASDCOPYPAIRSWAP: + rc = dasd_ioctl_copy_pair_swap(bdev, argp); + break; default: /* if the discipline has an ioctl method try it. */ rc = -ENOTTY; diff --git a/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c index 70e401fd432a..c37027276162 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c @@ -3537,7 +3537,7 @@ static struct attribute *host_v2_hw_attrs[] = { ATTRIBUTE_GROUPS(host_v2_hw); -static int map_queues_v2_hw(struct Scsi_Host *shost) +static void map_queues_v2_hw(struct Scsi_Host *shost) { struct hisi_hba *hisi_hba = shost_priv(shost); struct blk_mq_queue_map *qmap = &shost->tag_set.map[HCTX_TYPE_DEFAULT]; @@ -3552,9 +3552,6 @@ static int map_queues_v2_hw(struct Scsi_Host *shost) for_each_cpu(cpu, mask) qmap->mq_map[cpu] = qmap->queue_offset + queue; } - - return 0; - } static struct scsi_host_template sht_v2_hw = { diff --git a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c index efe8c5be5870..d716e5632d0f 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c @@ -3171,13 +3171,12 @@ static int debugfs_set_bist_v3_hw(struct hisi_hba *hisi_hba, bool enable) return 0; } -static int hisi_sas_map_queues(struct Scsi_Host *shost) +static void hisi_sas_map_queues(struct Scsi_Host *shost) { struct hisi_hba *hisi_hba = shost_priv(shost); struct blk_mq_queue_map *qmap = &shost->tag_set.map[HCTX_TYPE_DEFAULT]; - return blk_mq_pci_map_queues(qmap, hisi_hba->pci_dev, - BASE_VECTORS_V3_HW); + blk_mq_pci_map_queues(qmap, hisi_hba->pci_dev, BASE_VECTORS_V3_HW); } static struct scsi_host_template sht_v3_hw = { diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index f6c37a97544e..4ae85d590801 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -3174,7 +3174,7 @@ megasas_bios_param(struct scsi_device *sdev, struct block_device *bdev, return 0; } -static int megasas_map_queues(struct Scsi_Host *shost) +static void megasas_map_queues(struct Scsi_Host *shost) { struct megasas_instance *instance; int qoff = 0, offset; @@ -3183,7 +3183,7 @@ static int megasas_map_queues(struct Scsi_Host *shost) instance = (struct megasas_instance *)shost->hostdata; if (shost->nr_hw_queues == 1) - return 0; + return; offset = instance->low_latency_index_start; @@ -3209,8 +3209,6 @@ static int megasas_map_queues(struct Scsi_Host *shost) map->queue_offset = qoff; blk_mq_map_queues(map); } - - return 0; } static void megasas_aen_polling(struct work_struct *work); diff --git a/drivers/scsi/mpi3mr/mpi3mr_os.c b/drivers/scsi/mpi3mr/mpi3mr_os.c index bfa1165e23b6..9681c8bf24ed 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_os.c +++ b/drivers/scsi/mpi3mr/mpi3mr_os.c @@ -3464,7 +3464,7 @@ static int mpi3mr_bios_param(struct scsi_device *sdev, * * Return: return zero. */ -static int mpi3mr_map_queues(struct Scsi_Host *shost) +static void mpi3mr_map_queues(struct Scsi_Host *shost) { struct mpi3mr_ioc *mrioc = shost_priv(shost); int i, qoff, offset; @@ -3500,9 +3500,6 @@ static int mpi3mr_map_queues(struct Scsi_Host *shost) qoff += map->nr_queues; offset += map->nr_queues; } - - return 0; - } /** diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c index bd6a5f1bd532..791a406b4f8e 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c +++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c @@ -11872,7 +11872,7 @@ out: * scsih_map_queues - map reply queues with request queues * @shost: SCSI host pointer */ -static int scsih_map_queues(struct Scsi_Host *shost) +static void scsih_map_queues(struct Scsi_Host *shost) { struct MPT3SAS_ADAPTER *ioc = (struct MPT3SAS_ADAPTER *)shost->hostdata; @@ -11882,7 +11882,7 @@ static int scsih_map_queues(struct Scsi_Host *shost) int iopoll_q_count = ioc->reply_queue_count - nr_msix_vectors; if (shost->nr_hw_queues == 1) - return 0; + return; for (i = 0, qoff = 0; i < shost->nr_maps; i++) { map = &shost->tag_set.map[i]; @@ -11910,7 +11910,6 @@ static int scsih_map_queues(struct Scsi_Host *shost) qoff += map->nr_queues; } - return 0; } /* shost template for SAS 2.0 HBA devices */ diff --git a/drivers/scsi/pm8001/pm8001_init.c b/drivers/scsi/pm8001/pm8001_init.c index a0028e130a7e..2ff2fac1e403 100644 --- a/drivers/scsi/pm8001/pm8001_init.c +++ b/drivers/scsi/pm8001/pm8001_init.c @@ -81,7 +81,7 @@ LIST_HEAD(hba_list); struct workqueue_struct *pm8001_wq; -static int pm8001_map_queues(struct Scsi_Host *shost) +static void pm8001_map_queues(struct Scsi_Host *shost) { struct sas_ha_struct *sha = SHOST_TO_SAS_HA(shost); struct pm8001_hba_info *pm8001_ha = sha->lldd_ha; diff --git a/drivers/scsi/qla2xxx/qla_nvme.c b/drivers/scsi/qla2xxx/qla_nvme.c index 7450c3458be7..02fdeb0d31ec 100644 --- a/drivers/scsi/qla2xxx/qla_nvme.c +++ b/drivers/scsi/qla2xxx/qla_nvme.c @@ -684,12 +684,8 @@ static void qla_nvme_map_queues(struct nvme_fc_local_port *lport, struct blk_mq_queue_map *map) { struct scsi_qla_host *vha = lport->private; - int rc; - rc = blk_mq_pci_map_queues(map, vha->hw->pdev, vha->irq_offset); - if (rc) - ql_log(ql_log_warn, vha, 0x21de, - "pci map queue failed 0x%x", rc); + blk_mq_pci_map_queues(map, vha->hw->pdev, vha->irq_offset); } static void qla_nvme_localport_delete(struct nvme_fc_local_port *lport) diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 0bd0fd1042df..87a93892deac 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -350,7 +350,7 @@ MODULE_PARM_DESC(ql2xrspq_follow_inptr_legacy, static void qla2x00_clear_drv_active(struct qla_hw_data *); static void qla2x00_free_device(scsi_qla_host_t *); -static int qla2xxx_map_queues(struct Scsi_Host *shost); +static void qla2xxx_map_queues(struct Scsi_Host *shost); static void qla2x00_destroy_deferred_work(struct qla_hw_data *); u32 ql2xnvme_queues = DEF_NVME_HW_QUEUES; @@ -7994,17 +7994,15 @@ qla_pci_reset_done(struct pci_dev *pdev) clear_bit(ABORT_ISP_ACTIVE, &base_vha->dpc_flags); } -static int qla2xxx_map_queues(struct Scsi_Host *shost) +static void qla2xxx_map_queues(struct Scsi_Host *shost) { - int rc; scsi_qla_host_t *vha = (scsi_qla_host_t *)shost->hostdata; struct blk_mq_queue_map *qmap = &shost->tag_set.map[HCTX_TYPE_DEFAULT]; if (USER_CTRL_IRQ(vha->hw) || !vha->hw->mqiobase) - rc = blk_mq_map_queues(qmap); + blk_mq_map_queues(qmap); else - rc = blk_mq_pci_map_queues(qmap, vha->hw->pdev, vha->irq_offset); - return rc; + blk_mq_pci_map_queues(qmap, vha->hw->pdev, vha->irq_offset); } struct scsi_host_template qla2xxx_driver_template = { diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index b8a76b89f85a..697fc57bc711 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -7474,12 +7474,12 @@ static int resp_not_ready(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) return check_condition_result; } -static int sdebug_map_queues(struct Scsi_Host *shost) +static void sdebug_map_queues(struct Scsi_Host *shost) { int i, qoff; if (shost->nr_hw_queues == 1) - return 0; + return; for (i = 0, qoff = 0; i < HCTX_MAX_TYPES; i++) { struct blk_mq_queue_map *map = &shost->tag_set.map[i]; @@ -7501,9 +7501,6 @@ static int sdebug_map_queues(struct Scsi_Host *shost) qoff += map->nr_queues; } - - return 0; - } static int sdebug_blk_mq_poll(struct Scsi_Host *shost, unsigned int queue_num) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 96e7e3eaca29..d7ec4ab2b111 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1856,13 +1856,13 @@ static int scsi_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, return 0; } -static int scsi_map_queues(struct blk_mq_tag_set *set) +static void scsi_map_queues(struct blk_mq_tag_set *set) { struct Scsi_Host *shost = container_of(set, struct Scsi_Host, tag_set); if (shost->hostt->map_queues) return shost->hostt->map_queues(shost); - return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); + blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); } void __scsi_init_queue(struct Scsi_Host *shost, struct request_queue *q) diff --git a/drivers/scsi/smartpqi/smartpqi_init.c b/drivers/scsi/smartpqi/smartpqi_init.c index 7a8c2c75acba..b971fbe3b3a1 100644 --- a/drivers/scsi/smartpqi/smartpqi_init.c +++ b/drivers/scsi/smartpqi/smartpqi_init.c @@ -6436,12 +6436,12 @@ static int pqi_slave_alloc(struct scsi_device *sdev) return 0; } -static int pqi_map_queues(struct Scsi_Host *shost) +static void pqi_map_queues(struct Scsi_Host *shost) { struct pqi_ctrl_info *ctrl_info = shost_to_hba(shost); - return blk_mq_pci_map_queues(&shost->tag_set.map[HCTX_TYPE_DEFAULT], - ctrl_info->pci_dev, 0); + blk_mq_pci_map_queues(&shost->tag_set.map[HCTX_TYPE_DEFAULT], + ctrl_info->pci_dev, 0); } static inline bool pqi_is_tape_changer_device(struct pqi_scsi_dev *device) diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index 578c4b6d0f7d..077a8e24bd28 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -711,12 +711,12 @@ static int virtscsi_abort(struct scsi_cmnd *sc) return virtscsi_tmf(vscsi, cmd); } -static int virtscsi_map_queues(struct Scsi_Host *shost) +static void virtscsi_map_queues(struct Scsi_Host *shost) { struct virtio_scsi *vscsi = shost_priv(shost); struct blk_mq_queue_map *qmap = &shost->tag_set.map[HCTX_TYPE_DEFAULT]; - return blk_mq_virtio_map_queues(qmap, vscsi->vdev, 2); + blk_mq_virtio_map_queues(qmap, vscsi->vdev, 2); } static void virtscsi_commit_rqs(struct Scsi_Host *shost, u16 hwq) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index a202d7d5240d..9538832b03a0 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -2701,9 +2701,9 @@ static inline bool is_device_wlun(struct scsi_device *sdev) * Associate the UFS controller queue with the default and poll HCTX types. * Initialize the mq_map[] arrays. */ -static int ufshcd_map_queues(struct Scsi_Host *shost) +static void ufshcd_map_queues(struct Scsi_Host *shost) { - int i, ret; + int i; for (i = 0; i < shost->nr_maps; i++) { struct blk_mq_queue_map *map = &shost->tag_set.map[i]; @@ -2720,11 +2720,8 @@ static int ufshcd_map_queues(struct Scsi_Host *shost) WARN_ON_ONCE(true); } map->queue_offset = 0; - ret = blk_mq_map_queues(map); - WARN_ON_ONCE(ret); + blk_mq_map_queues(map); } - - return 0; } static void ufshcd_init_lrb(struct ufs_hba *hba, struct ufshcd_lrb *lrb, int i) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 54caa00a2245..c0a08064b0a7 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -15,6 +15,7 @@ #include <linux/string.h> #include <linux/backing-dev.h> #include <linux/writeback.h> +#include <linux/psi.h> #include <linux/slab.h> #include <linux/sched/mm.h> #include <linux/log2.h> @@ -511,7 +512,8 @@ static u64 bio_end_offset(struct bio *bio) */ static noinline int add_ra_bio_pages(struct inode *inode, u64 compressed_end, - struct compressed_bio *cb) + struct compressed_bio *cb, + unsigned long *pflags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); unsigned long end_index; @@ -580,6 +582,9 @@ static noinline int add_ra_bio_pages(struct inode *inode, continue; } + if (PageWorkingset(page)) + psi_memstall_enter(pflags); + ret = set_page_extent_mapped(page); if (ret < 0) { unlock_page(page); @@ -666,6 +671,8 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, u64 em_len; u64 em_start; struct extent_map *em; + /* Initialize to 1 to make skip psi_memstall_leave unless needed */ + unsigned long pflags = 1; blk_status_t ret; int ret2; int i; @@ -721,7 +728,7 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, goto fail; } - add_ra_bio_pages(inode, em_start + em_len, cb); + add_ra_bio_pages(inode, em_start + em_len, cb, &pflags); /* include any pages we added in add_ra-bio_pages */ cb->len = bio->bi_iter.bi_size; @@ -801,6 +808,9 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, } } + if (!pflags) + psi_memstall_leave(&pflags); + if (refcount_dec_and_test(&cb->pending_ios)) finish_compressed_bio_read(cb); return; diff --git a/fs/direct-io.c b/fs/direct-io.c index f669163d5860..03d381377ae1 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -421,8 +421,6 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) unsigned long flags; bio->bi_private = dio; - /* don't account direct I/O as memory stall */ - bio_clear_flag(bio, BIO_WORKINGSET); spin_lock_irqsave(&dio->bio_lock, flags); dio->refcount++; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index cce56dde135c..559380a535af 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -7,6 +7,7 @@ #include "zdata.h" #include "compress.h" #include <linux/prefetch.h> +#include <linux/psi.h> #include <trace/events/erofs.h> @@ -1414,6 +1415,8 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, struct block_device *last_bdev; unsigned int nr_bios = 0; struct bio *bio = NULL; + /* initialize to 1 to make skip psi_memstall_leave unless needed */ + unsigned long pflags = 1; bi_private = jobqueueset_init(sb, q, fgq, force_fg); qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head; @@ -1463,10 +1466,15 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, if (bio && (cur != last_index + 1 || last_bdev != mdev.m_bdev)) { submit_bio_retry: + if (!pflags) + psi_memstall_leave(&pflags); submit_bio(bio); bio = NULL; } + if (unlikely(PageWorkingset(page))) + psi_memstall_enter(&pflags); + if (!bio) { bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOIO); @@ -1494,8 +1502,11 @@ submit_bio_retry: move_to_bypass_jobqueue(pcl, qtail, owned_head); } while (owned_head != Z_EROFS_PCLUSTER_TAIL); - if (bio) + if (bio) { + if (!pflags) + psi_memstall_leave(&pflags); submit_bio(bio); + } /* * although background is preferred, no one is pending for submission. diff --git a/include/linux/bio.h b/include/linux/bio.h index ca22b06700a9..2c5806997bbf 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -509,7 +509,7 @@ static inline void bio_set_dev(struct bio *bio, struct block_device *bdev) { bio_clear_flag(bio, BIO_REMAPPED); if (bio->bi_bdev != bdev) - bio_clear_flag(bio, BIO_THROTTLED); + bio_clear_flag(bio, BIO_BPS_THROTTLED); bio->bi_bdev = bdev; bio_associate_blkg(bio); } diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 9f40dbc65f82..dd5841a42c33 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -18,14 +18,14 @@ struct bio; struct cgroup_subsys_state; -struct request_queue; +struct gendisk; #define FC_APPID_LEN 129 #ifdef CONFIG_BLK_CGROUP extern struct cgroup_subsys_state * const blkcg_root_css; -void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay); +void blkcg_schedule_throttle(struct gendisk *disk, bool use_memdelay); void blkcg_maybe_throttle_current(void); bool blk_cgroup_congested(void); void blkcg_pin_online(struct cgroup_subsys_state *blkcg_css); @@ -39,7 +39,6 @@ struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio); static inline void blkcg_maybe_throttle_current(void) { } static inline bool blk_cgroup_congested(void) { return false; } -static inline void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) { } static inline struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio) { return NULL; diff --git a/include/linux/blk-mq-pci.h b/include/linux/blk-mq-pci.h index 0b1f45c62623..ca544e1d3508 100644 --- a/include/linux/blk-mq-pci.h +++ b/include/linux/blk-mq-pci.h @@ -5,7 +5,7 @@ struct blk_mq_queue_map; struct pci_dev; -int blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev, - int offset); +void blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev, + int offset); #endif /* _LINUX_BLK_MQ_PCI_H */ diff --git a/include/linux/blk-mq-rdma.h b/include/linux/blk-mq-rdma.h index 5cc5f0f36218..53b58c610e76 100644 --- a/include/linux/blk-mq-rdma.h +++ b/include/linux/blk-mq-rdma.h @@ -5,7 +5,7 @@ struct blk_mq_tag_set; struct ib_device; -int blk_mq_rdma_map_queues(struct blk_mq_queue_map *map, +void blk_mq_rdma_map_queues(struct blk_mq_queue_map *map, struct ib_device *dev, int first_vec); #endif /* _LINUX_BLK_MQ_RDMA_H */ diff --git a/include/linux/blk-mq-virtio.h b/include/linux/blk-mq-virtio.h index 687ae287e1dc..13226e9b22dd 100644 --- a/include/linux/blk-mq-virtio.h +++ b/include/linux/blk-mq-virtio.h @@ -5,7 +5,7 @@ struct blk_mq_queue_map; struct virtio_device; -int blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap, +void blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap, struct virtio_device *vdev, int first_vec); #endif /* _LINUX_BLK_MQ_VIRTIO_H */ diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index de384f5d2c37..00a15808c137 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -268,9 +268,16 @@ static inline void rq_list_move(struct request **src, struct request **dst, rq_list_add(dst, rq); } +/** + * enum blk_eh_timer_return - How the timeout handler should proceed + * @BLK_EH_DONE: The block driver completed the command or will complete it at + * a later time. + * @BLK_EH_RESET_TIMER: Reset the request timer and continue waiting for the + * request to complete. + */ enum blk_eh_timer_return { - BLK_EH_DONE, /* drivers has completed the command */ - BLK_EH_RESET_TIMER, /* reset timer and try again */ + BLK_EH_DONE, + BLK_EH_RESET_TIMER, }; #define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */ @@ -630,7 +637,7 @@ struct blk_mq_ops { * @map_queues: This allows drivers specify their own queue mapping by * overriding the setup-time function that builds the mq_map. */ - int (*map_queues)(struct blk_mq_tag_set *set); + void (*map_queues)(struct blk_mq_tag_set *set); #ifdef CONFIG_BLK_DEBUG_FS /** @@ -880,7 +887,7 @@ void blk_mq_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); -int blk_mq_map_queues(struct blk_mq_queue_map *qmap); +void blk_mq_map_queues(struct blk_mq_queue_map *qmap); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); void blk_mq_quiesce_queue_nowait(struct request_queue *q); @@ -963,11 +970,11 @@ blk_status_t blk_insert_cloned_request(struct request *rq); struct rq_map_data { struct page **pages; - int page_order; - int nr_entries; unsigned long offset; - int null_mapped; - int from_user; + unsigned short page_order; + unsigned short nr_entries; + bool null_mapped; + bool from_user; }; int blk_rq_map_user(struct request_queue *, struct request *, diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1ef99790f6ed..e0b098089ef2 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -321,11 +321,10 @@ enum { BIO_NO_PAGE_REF, /* don't put release vec pages */ BIO_CLONED, /* doesn't own data */ BIO_BOUNCED, /* bio is a bounce bio */ - BIO_WORKINGSET, /* contains userspace workingset pages */ BIO_QUIET, /* Make BIO Quiet */ BIO_CHAIN, /* chained bio, ->bi_remaining in effect */ BIO_REFFED, /* bio has elevated ->bi_cnt */ - BIO_THROTTLED, /* This bio has already been subjected to + BIO_BPS_THROTTLED, /* This bio has already been subjected to * throttling rules. Don't do it again. */ BIO_TRACE_COMPLETION, /* bio_endio() should trace the final completion * of this bio. */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8038c5fbde40..3e187a02924f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -618,7 +618,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_quiesced(q) test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags) #define blk_queue_pm_only(q) atomic_read(&(q)->pm_only) #define blk_queue_registered(q) test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags) -#define blk_queue_nowait(q) test_bit(QUEUE_FLAG_NOWAIT, &(q)->queue_flags) #define blk_queue_sq_sched(q) test_bit(QUEUE_FLAG_SQ_SCHED, &(q)->queue_flags) extern void blk_set_pm_only(struct request_queue *q); @@ -1280,6 +1279,11 @@ static inline bool bdev_fua(struct block_device *bdev) return test_bit(QUEUE_FLAG_FUA, &bdev_get_queue(bdev)->queue_flags); } +static inline bool bdev_nowait(struct block_device *bdev) +{ + return test_bit(QUEUE_FLAG_NOWAIT, &bdev_get_queue(bdev)->queue_flags); +} + static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); @@ -1300,6 +1304,15 @@ static inline bool bdev_is_zoned(struct block_device *bdev) return false; } +static inline bool bdev_op_is_zoned_write(struct block_device *bdev, + blk_opf_t op) +{ + if (!bdev_is_zoned(bdev)) + return false; + + return op == REQ_OP_WRITE || op == REQ_OP_WRITE_ZEROES; +} + static inline sector_t bdev_zone_sectors(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index ae53d74f3696..050d7d0cd81b 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1482,8 +1482,8 @@ struct nvmf_connect_command { }; enum { - NVME_CONNECT_AUTHREQ_ASCR = (1 << 2), - NVME_CONNECT_AUTHREQ_ATR = (1 << 1), + NVME_CONNECT_AUTHREQ_ASCR = (1U << 18), + NVME_CONNECT_AUTHREQ_ATR = (1U << 17), }; struct nvmf_connect_data { diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 0178b2040ea3..201dc7281640 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1173,6 +1173,8 @@ struct readahead_control { pgoff_t _index; unsigned int _nr_pages; unsigned int _batch_count; + bool _workingset; + unsigned long _pflags; }; #define DEFINE_READAHEAD(ractl, f, r, m, i) \ diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index 8f5a86e210b9..4d2d5205ab58 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -575,8 +575,9 @@ void sbitmap_queue_wake_all(struct sbitmap_queue *sbq); * sbitmap_queue_wake_up() - Wake up some of waiters in one waitqueue * on a &struct sbitmap_queue. * @sbq: Bitmap queue to wake up. + * @nr: Number of bits cleared. */ -void sbitmap_queue_wake_up(struct sbitmap_queue *sbq); +void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr); /** * sbitmap_queue_show() - Dump &struct sbitmap_queue information to a &struct diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index 1ac0d712a9c3..6f837bb6c715 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -43,6 +43,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_MBR_DONE: case IOC_OPAL_WRITE_SHADOW_MBR: case IOC_OPAL_GENERIC_TABLE_RW: + case IOC_OPAL_GET_STATUS: return true; } return false; diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index 9b0a028bf053..fcf25f1642a3 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -276,7 +276,7 @@ struct scsi_host_template { * * Status: OPTIONAL */ - int (* map_queues)(struct Scsi_Host *shost); + void (* map_queues)(struct Scsi_Host *shost); /* * SCSI interface of blk_poll - poll for IO completions. diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index 6f5af1a84213..2573772e2fb3 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -132,6 +132,18 @@ struct opal_read_write_table { __u64 priv; }; +#define OPAL_FL_SUPPORTED 0x00000001 +#define OPAL_FL_LOCKING_SUPPORTED 0x00000002 +#define OPAL_FL_LOCKING_ENABLED 0x00000004 +#define OPAL_FL_LOCKED 0x00000008 +#define OPAL_FL_MBR_ENABLED 0x00000010 +#define OPAL_FL_MBR_DONE 0x00000020 + +struct opal_status { + __u32 flags; + __u32 reserved; +}; + #define IOC_OPAL_SAVE _IOW('p', 220, struct opal_lock_unlock) #define IOC_OPAL_LOCK_UNLOCK _IOW('p', 221, struct opal_lock_unlock) #define IOC_OPAL_TAKE_OWNERSHIP _IOW('p', 222, struct opal_key) @@ -148,5 +160,6 @@ struct opal_read_write_table { #define IOC_OPAL_MBR_DONE _IOW('p', 233, struct opal_mbr_done) #define IOC_OPAL_WRITE_SHADOW_MBR _IOW('p', 234, struct opal_shadow_mbr) #define IOC_OPAL_GENERIC_TABLE_RW _IOW('p', 235, struct opal_read_write_table) +#define IOC_OPAL_GET_STATUS _IOR('p', 236, struct opal_status) #endif /* _UAPI_SED_OPAL_H */ diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 677edaab2b66..8f88e3a29998 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -17,7 +17,8 @@ #define UBLK_CMD_STOP_DEV 0x07 #define UBLK_CMD_SET_PARAMS 0x08 #define UBLK_CMD_GET_PARAMS 0x09 - +#define UBLK_CMD_START_USER_RECOVERY 0x10 +#define UBLK_CMD_END_USER_RECOVERY 0x11 /* * IO commands, issued by ublk server, and handled by ublk driver. * @@ -74,9 +75,14 @@ */ #define UBLK_F_NEED_GET_DATA (1UL << 2) +#define UBLK_F_USER_RECOVERY (1UL << 3) + +#define UBLK_F_USER_RECOVERY_REISSUE (1UL << 4) + /* device state */ #define UBLK_S_DEV_DEAD 0 #define UBLK_S_DEV_LIVE 1 +#define UBLK_S_DEV_QUIESCED 2 /* shipped via sqe->cmd of io_uring command */ struct ublksrv_ctrl_cmd { diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 00db7ad907b6..99a52f34b7d3 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1540,7 +1540,7 @@ static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags) static bool io_bdev_nowait(struct block_device *bdev) { - return !bdev || blk_queue_nowait(bdev_get_queue(bdev)); + return !bdev || bdev_nowait(bdev); } /* diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index ecb4b4ff4ce0..7f6030091aee 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -917,6 +917,7 @@ void psi_memstall_enter(unsigned long *flags) rq_unlock_irq(rq, &rf); } +EXPORT_SYMBOL_GPL(psi_memstall_enter); /** * psi_memstall_leave - mark the end of an memory stall section @@ -946,6 +947,7 @@ void psi_memstall_leave(unsigned long *flags) rq_unlock_irq(rq, &rf); } +EXPORT_SYMBOL_GPL(psi_memstall_leave); #ifdef CONFIG_CGROUPS int psi_cgroup_alloc(struct cgroup *cgroup) diff --git a/lib/sbitmap.c b/lib/sbitmap.c index 29eb0484215a..a8108a962dfd 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -533,21 +533,20 @@ unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags, nr = find_first_zero_bit(&map->word, map_depth); if (nr + nr_tags <= map_depth) { atomic_long_t *ptr = (atomic_long_t *) &map->word; - int map_tags = min_t(int, nr_tags, map_depth); - unsigned long val, ret; + unsigned long val; - get_mask = ((1UL << map_tags) - 1) << nr; + get_mask = ((1UL << nr_tags) - 1) << nr; + val = READ_ONCE(map->word); do { - val = READ_ONCE(map->word); if ((val & ~get_mask) != val) goto next; - ret = atomic_long_cmpxchg(ptr, val, get_mask | val); - } while (ret != val); - get_mask = (get_mask & ~ret) >> nr; + } while (!atomic_long_try_cmpxchg(ptr, &val, + get_mask | val)); + get_mask = (get_mask & ~val) >> nr; if (get_mask) { *offset = nr + (index << sb->shift); update_alloc_hint_after_get(sb, depth, hint, - *offset + map_tags - 1); + *offset + nr_tags - 1); return get_mask; } } @@ -588,7 +587,7 @@ static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq) for (i = 0; i < SBQ_WAIT_QUEUES; i++) { struct sbq_wait_state *ws = &sbq->ws[wake_index]; - if (waitqueue_active(&ws->wait)) { + if (waitqueue_active(&ws->wait) && atomic_read(&ws->wait_cnt)) { if (wake_index != atomic_read(&sbq->wake_index)) atomic_set(&sbq->wake_index, wake_index); return ws; @@ -600,50 +599,82 @@ static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq) return NULL; } -static bool __sbq_wake_up(struct sbitmap_queue *sbq) +static bool __sbq_wake_up(struct sbitmap_queue *sbq, int *nr) { struct sbq_wait_state *ws; unsigned int wake_batch; - int wait_cnt; + int wait_cnt, cur, sub; + bool ret; + + if (*nr <= 0) + return false; ws = sbq_wake_ptr(sbq); if (!ws) return false; - wait_cnt = atomic_dec_return(&ws->wait_cnt); - if (wait_cnt <= 0) { - int ret; - - wake_batch = READ_ONCE(sbq->wake_batch); - + cur = atomic_read(&ws->wait_cnt); + do { /* - * Pairs with the memory barrier in sbitmap_queue_resize() to - * ensure that we see the batch size update before the wait - * count is reset. + * For concurrent callers of this, callers should call this + * function again to wakeup a new batch on a different 'ws'. */ - smp_mb__before_atomic(); + if (cur == 0) + return true; + sub = min(*nr, cur); + wait_cnt = cur - sub; + } while (!atomic_try_cmpxchg(&ws->wait_cnt, &cur, wait_cnt)); - /* - * For concurrent callers of this, the one that failed the - * atomic_cmpxhcg() race should call this function again - * to wakeup a new batch on a different 'ws'. - */ - ret = atomic_cmpxchg(&ws->wait_cnt, wait_cnt, wake_batch); - if (ret == wait_cnt) { - sbq_index_atomic_inc(&sbq->wake_index); - wake_up_nr(&ws->wait, wake_batch); - return false; - } + /* + * If we decremented queue without waiters, retry to avoid lost + * wakeups. + */ + if (wait_cnt > 0) + return !waitqueue_active(&ws->wait); - return true; - } + *nr -= sub; - return false; + /* + * When wait_cnt == 0, we have to be particularly careful as we are + * responsible to reset wait_cnt regardless whether we've actually + * woken up anybody. But in case we didn't wakeup anybody, we still + * need to retry. + */ + ret = !waitqueue_active(&ws->wait); + wake_batch = READ_ONCE(sbq->wake_batch); + + /* + * Wake up first in case that concurrent callers decrease wait_cnt + * while waitqueue is empty. + */ + wake_up_nr(&ws->wait, wake_batch); + + /* + * Pairs with the memory barrier in sbitmap_queue_resize() to + * ensure that we see the batch size update before the wait + * count is reset. + * + * Also pairs with the implicit barrier between decrementing wait_cnt + * and checking for waitqueue_active() to make sure waitqueue_active() + * sees result of the wakeup if atomic_dec_return() has seen the result + * of atomic_set(). + */ + smp_mb__before_atomic(); + + /* + * Increase wake_index before updating wait_cnt, otherwise concurrent + * callers can see valid wait_cnt in old waitqueue, which can cause + * invalid wakeup on the old waitqueue. + */ + sbq_index_atomic_inc(&sbq->wake_index); + atomic_set(&ws->wait_cnt, wake_batch); + + return ret || *nr; } -void sbitmap_queue_wake_up(struct sbitmap_queue *sbq) +void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr) { - while (__sbq_wake_up(sbq)) + while (__sbq_wake_up(sbq, &nr)) ; } EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up); @@ -683,7 +714,7 @@ void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset, atomic_long_andnot(mask, (atomic_long_t *) addr); smp_mb__after_atomic(); - sbitmap_queue_wake_up(sbq); + sbitmap_queue_wake_up(sbq, nr_tags); sbitmap_update_cpu_hint(&sbq->sb, raw_smp_processor_id(), tags[nr_tags - 1] - offset); } @@ -711,7 +742,7 @@ void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, * waiter. See the comment on waitqueue_active(). */ smp_mb__after_atomic(); - sbitmap_queue_wake_up(sbq); + sbitmap_queue_wake_up(sbq, 1); sbitmap_update_cpu_hint(&sbq->sb, cpu, nr); } EXPORT_SYMBOL_GPL(sbitmap_queue_clear); diff --git a/mm/filemap.c b/mm/filemap.c index 15800334147b..c943d1b90cc2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2382,6 +2382,8 @@ retry: static int filemap_read_folio(struct file *file, filler_t filler, struct folio *folio) { + bool workingset = folio_test_workingset(folio); + unsigned long pflags; int error; /* @@ -2390,8 +2392,13 @@ static int filemap_read_folio(struct file *file, filler_t filler, * fails. */ folio_clear_error(folio); + /* Start the actual read. The read will unlock the page. */ + if (unlikely(workingset)) + psi_memstall_enter(&pflags); error = filler(file, folio); + if (unlikely(workingset)) + psi_memstall_leave(&pflags); if (error) return error; diff --git a/mm/readahead.c b/mm/readahead.c index fdcd28cbd92d..b10f0cf81d80 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -122,6 +122,7 @@ #include <linux/task_io_accounting_ops.h> #include <linux/pagevec.h> #include <linux/pagemap.h> +#include <linux/psi.h> #include <linux/syscalls.h> #include <linux/file.h> #include <linux/mm_inline.h> @@ -152,6 +153,8 @@ static void read_pages(struct readahead_control *rac) if (!readahead_count(rac)) return; + if (unlikely(rac->_workingset)) + psi_memstall_enter(&rac->_pflags); blk_start_plug(&plug); if (aops->readahead) { @@ -179,6 +182,9 @@ static void read_pages(struct readahead_control *rac) } blk_finish_plug(&plug); + if (unlikely(rac->_workingset)) + psi_memstall_leave(&rac->_pflags); + rac->_workingset = false; BUG_ON(readahead_count(rac)); } @@ -252,6 +258,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, } if (i == nr_to_read - lookahead_size) folio_set_readahead(folio); + ractl->_workingset |= folio_test_workingset(folio); ractl->_nr_pages++; } @@ -480,11 +487,14 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index, if (index == mark) folio_set_readahead(folio); err = filemap_add_folio(ractl->mapping, folio, index, gfp); - if (err) + if (err) { folio_put(folio); - else - ractl->_nr_pages += 1UL << order; - return err; + return err; + } + + ractl->_nr_pages += 1UL << order; + ractl->_workingset |= folio_test_workingset(folio); + return 0; } void page_cache_ra_order(struct readahead_control *ractl, @@ -826,6 +836,10 @@ void readahead_expand(struct readahead_control *ractl, put_page(page); return; } + if (unlikely(PageWorkingset(page)) && !ractl->_workingset) { + ractl->_workingset = true; + psi_memstall_enter(&ractl->_pflags); + } ractl->_nr_pages++; if (ra) { ra->size++; diff --git a/mm/swapfile.c b/mm/swapfile.c index 1fdccd2f1422..82e62007881d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3655,7 +3655,7 @@ void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) plist_for_each_entry_safe(si, next, &swap_avail_heads[nid], avail_lists[nid]) { if (si->bdev) { - blkcg_schedule_throttle(bdev_get_queue(si->bdev), true); + blkcg_schedule_throttle(si->bdev->bd_disk, true); break; } } |