author		Linus Torvalds <torvalds@linux-foundation.org>	2024-07-22 11:32:05 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2024-07-22 11:32:05 -0700
commit		7d080fa867092c1db078dd72d70cb256642f7b18 (patch)
tree		6185f09f14510e8d733d05c68c0aa6fcde262cf0
parent		0256994887d7c89c2a41d872aac67605bda8f115 (diff)
parent		89ed6c9ac69ec398ccb648f5f675b43e8ca679ca (diff)
Merge tag 'for-6.11/block-20240722' of git://git.kernel.dk/linux
Pull more block updates from Jens Axboe:
- MD fixes via Song:
    - md-cluster fixes (Heming Zhao)
    - raid1 fix (Mateusz Jończyk)
- s390/dasd module description (Jeff)
- Series cleaning up and hardening the blk-mq debugfs flag handling
  (John, Christoph) (see the sketch after the commit list)
- blk-cgroup cleanup (Xiu)
- Error out polled IO attempts if the backend doesn't support polling (hexue)
- Fix for an sbitmap hang (Yang) (see the sketch after the diff)
* tag 'for-6.11/block-20240722' of git://git.kernel.dk/linux: (23 commits)
blk-cgroup: move congestion_count to struct blkcg
sbitmap: fix io hung due to race on sbitmap_word::cleared
block: avoid polling configuration errors
block: Catch possible entries missing from rqf_name[]
block: Simplify definition of RQF_NAME()
block: Use enum to define RQF_x bit indexes
block: Catch possible entries missing from cmd_flag_name[]
block: Catch possible entries missing from alloc_policy_name[]
block: Catch possible entries missing from hctx_flag_name[]
block: Catch possible entries missing from hctx_state_name[]
block: Catch possible entries missing from blk_queue_flag_name[]
block: Make QUEUE_FLAG_x as an enum
block: Relocate BLK_MQ_MAX_DEPTH
block: Relocate BLK_MQ_CPU_WORK_BATCH
block: remove QUEUE_FLAG_STOPPED
block: Add missing entry to hctx_flag_name[]
block: Add zone write plugging entry to rqf_name[]
block: Add missing entries from cmd_flag_name[]
s390/dasd: fix error checks in dasd_copy_pair_store()
s390/dasd: add missing MODULE_DESCRIPTION() macros
...
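
The debugfs hardening series above follows one pattern throughout: each
flag enum gains a terminating _MAX (or _BITS) entry, the matching debugfs
name table is indexed by the enum, and a BUILD_BUG_ON() asserts that the
two stay the same size. A minimal userspace sketch of that pattern, with
C11 _Static_assert standing in for the kernel's BUILD_BUG_ON (the flag
names are illustrative, not the kernel's full set):

    #include <stdio.h>

    /* Keep queue_flag_name[] in sync with the definitions below */
    enum {
            QUEUE_FLAG_DYING,       /* queue being torn down */
            QUEUE_FLAG_NOMERGES,    /* disable merge attempts */
            QUEUE_FLAG_FAIL_IO,     /* fake timeout */
            QUEUE_FLAG_MAX          /* must stay last */
    };

    #define QUEUE_FLAG_NAME(name) [QUEUE_FLAG_##name] = #name
    static const char *const queue_flag_name[] = {
            QUEUE_FLAG_NAME(DYING),
            QUEUE_FLAG_NAME(NOMERGES),
            QUEUE_FLAG_NAME(FAIL_IO),
    };
    #undef QUEUE_FLAG_NAME

    /* compilation fails if a flag is added without a name entry */
    _Static_assert(sizeof(queue_flag_name) / sizeof(queue_flag_name[0]) ==
                   QUEUE_FLAG_MAX, "queue_flag_name[] out of sync");

    int main(void)
    {
            for (int i = 0; i < QUEUE_FLAG_MAX; i++)
                    printf("%d = %s\n", i, queue_flag_name[i]);
            return 0;
    }

Asserting this in code rather than relying on review means a mismatch is
caught at build time on every configuration, not when someone happens to
read truncated debugfs output.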
-rw-r--r--	block/blk-cgroup.c			7
-rw-r--r--	block/blk-cgroup.h			10
-rw-r--r--	block/blk-core.c			5
-rw-r--r--	block/blk-mq-debugfs.c			26
-rw-r--r--	block/blk-mq.h				2
-rw-r--r--	drivers/md/md-cluster.c			49
-rw-r--r--	drivers/md/md-cluster.h			2
-rw-r--r--	drivers/md/md.c				17
-rw-r--r--	drivers/md/raid1.c			1
-rw-r--r--	drivers/s390/block/dasd_devmap.c	10
-rw-r--r--	drivers/s390/block/dasd_diag.c		1
-rw-r--r--	drivers/s390/block/dasd_eckd.c		1
-rw-r--r--	drivers/s390/block/dasd_fba.c		1
-rw-r--r--	include/linux/blk-mq.h			127
-rw-r--r--	include/linux/blk_types.h		1
-rw-r--r--	include/linux/blkdev.h			31
-rw-r--r--	include/linux/cgroup-defs.h		3
-rw-r--r--	include/linux/sbitmap.h			5
-rw-r--r--	lib/sbitmap.c				36
19 files changed, 233 insertions, 102 deletions
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 37e6cc91d576..69e70964398c 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -2182,12 +2182,13 @@ void blk_cgroup_bio_start(struct bio *bio)
 
 bool blk_cgroup_congested(void)
 {
-	struct cgroup_subsys_state *css;
+	struct blkcg *blkcg;
 	bool ret = false;
 
 	rcu_read_lock();
-	for (css = blkcg_css(); css; css = css->parent) {
-		if (atomic_read(&css->cgroup->congestion_count)) {
+	for (blkcg = css_to_blkcg(blkcg_css()); blkcg;
+	     blkcg = blkcg_parent(blkcg)) {
+		if (atomic_read(&blkcg->congestion_count)) {
 			ret = true;
 			break;
 		}
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index bd472a30bc61..864fad4a850b 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -95,6 +95,8 @@ struct blkcg {
 	struct cgroup_subsys_state	css;
 	spinlock_t			lock;
 	refcount_t			online_pin;
+	/* If there is block congestion on this cgroup. */
+	atomic_t			congestion_count;
 
 	struct radix_tree_root		blkg_tree;
 	struct blkcg_gq	__rcu		*blkg_hint;
@@ -374,7 +376,7 @@ static inline void blkcg_use_delay(struct blkcg_gq *blkg)
 	if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0))
 		return;
 	if (atomic_add_return(1, &blkg->use_delay) == 1)
-		atomic_inc(&blkg->blkcg->css.cgroup->congestion_count);
+		atomic_inc(&blkg->blkcg->congestion_count);
 }
 
 static inline int blkcg_unuse_delay(struct blkcg_gq *blkg)
@@ -399,7 +401,7 @@ static inline int blkcg_unuse_delay(struct blkcg_gq *blkg)
 	if (old == 0)
 		return 0;
 	if (old == 1)
-		atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
+		atomic_dec(&blkg->blkcg->congestion_count);
 
 	return 1;
 }
@@ -418,7 +420,7 @@ static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay)
 
 	/* We only want 1 person setting the congestion count for this blkg. */
 	if (!old && atomic_try_cmpxchg(&blkg->use_delay, &old, -1))
-		atomic_inc(&blkg->blkcg->css.cgroup->congestion_count);
+		atomic_inc(&blkg->blkcg->congestion_count);
 
 	atomic64_set(&blkg->delay_nsec, delay);
 }
@@ -435,7 +437,7 @@ static inline void blkcg_clear_delay(struct blkcg_gq *blkg)
 
 	/* We only want 1 person clearing the congestion count for this blkg. */
 	if (old && atomic_try_cmpxchg(&blkg->use_delay, &old, 0))
-		atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
+		atomic_dec(&blkg->blkcg->congestion_count);
 }
 
 /**
diff --git a/block/blk-core.c b/block/blk-core.c
index 02bceeb36f2c..1217c2cd66dd 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -791,8 +791,11 @@ void submit_bio_noacct(struct bio *bio)
 		}
 	}
 
-	if (!(q->limits.features & BLK_FEAT_POLL))
+	if (!(q->limits.features & BLK_FEAT_POLL) &&
+	    (bio->bi_opf & REQ_POLLED)) {
 		bio_clear_polled(bio);
+		goto not_supported;
+	}
 
 	switch (bio_op(bio)) {
 	case REQ_OP_READ:
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 344f9e503bdb..5463697a8442 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -5,6 +5,7 @@
 
 #include <linux/kernel.h>
 #include <linux/blkdev.h>
+#include <linux/build_bug.h>
 #include <linux/debugfs.h>
 
 #include "blk.h"
@@ -79,7 +80,6 @@ static int queue_pm_only_show(void *data, struct seq_file *m)
 
 #define QUEUE_FLAG_NAME(name) [QUEUE_FLAG_##name] = #name
 static const char *const blk_queue_flag_name[] = {
-	QUEUE_FLAG_NAME(STOPPED),
 	QUEUE_FLAG_NAME(DYING),
 	QUEUE_FLAG_NAME(NOMERGES),
 	QUEUE_FLAG_NAME(SAME_COMP),
@@ -100,6 +100,7 @@ static int queue_state_show(void *data, struct seq_file *m)
 {
 	struct request_queue *q = data;
 
+	BUILD_BUG_ON(ARRAY_SIZE(blk_queue_flag_name) != QUEUE_FLAG_MAX);
 	blk_flags_show(m, q->queue_flags, blk_queue_flag_name,
 		       ARRAY_SIZE(blk_queue_flag_name));
 	seq_puts(m, "\n");
@@ -164,6 +165,7 @@ static int hctx_state_show(void *data, struct seq_file *m)
 {
 	struct blk_mq_hw_ctx *hctx = data;
 
+	BUILD_BUG_ON(ARRAY_SIZE(hctx_state_name) != BLK_MQ_S_MAX);
 	blk_flags_show(m, hctx->state, hctx_state_name,
 		       ARRAY_SIZE(hctx_state_name));
 	seq_puts(m, "\n");
@@ -181,10 +183,11 @@ static const char *const alloc_policy_name[] = {
 static const char *const hctx_flag_name[] = {
 	HCTX_FLAG_NAME(SHOULD_MERGE),
 	HCTX_FLAG_NAME(TAG_QUEUE_SHARED),
-	HCTX_FLAG_NAME(BLOCKING),
-	HCTX_FLAG_NAME(NO_SCHED),
 	HCTX_FLAG_NAME(STACKING),
 	HCTX_FLAG_NAME(TAG_HCTX_SHARED),
+	HCTX_FLAG_NAME(BLOCKING),
+	HCTX_FLAG_NAME(NO_SCHED),
+	HCTX_FLAG_NAME(NO_SCHED_BY_DEFAULT),
 };
 #undef HCTX_FLAG_NAME
@@ -193,6 +196,10 @@ static int hctx_flags_show(void *data, struct seq_file *m)
 	struct blk_mq_hw_ctx *hctx = data;
 	const int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(hctx->flags);
 
+	BUILD_BUG_ON(ARRAY_SIZE(hctx_flag_name) !=
+			BLK_MQ_F_ALLOC_POLICY_START_BIT);
+	BUILD_BUG_ON(ARRAY_SIZE(alloc_policy_name) != BLK_TAG_ALLOC_MAX);
+
 	seq_puts(m, "alloc_policy=");
 	if (alloc_policy < ARRAY_SIZE(alloc_policy_name) &&
 	    alloc_policy_name[alloc_policy])
@@ -223,12 +230,17 @@ static const char *const cmd_flag_name[] = {
 	CMD_FLAG_NAME(RAHEAD),
 	CMD_FLAG_NAME(BACKGROUND),
 	CMD_FLAG_NAME(NOWAIT),
-	CMD_FLAG_NAME(NOUNMAP),
 	CMD_FLAG_NAME(POLLED),
+	CMD_FLAG_NAME(ALLOC_CACHE),
+	CMD_FLAG_NAME(SWAP),
+	CMD_FLAG_NAME(DRV),
+	CMD_FLAG_NAME(FS_PRIVATE),
+	CMD_FLAG_NAME(ATOMIC),
+	CMD_FLAG_NAME(NOUNMAP),
 };
 #undef CMD_FLAG_NAME
 
-#define RQF_NAME(name) [ilog2((__force u32)RQF_##name)] = #name
+#define RQF_NAME(name) [__RQF_##name] = #name
 static const char *const rqf_name[] = {
 	RQF_NAME(STARTED),
 	RQF_NAME(FLUSH_SEQ),
@@ -243,6 +255,7 @@ static const char *const rqf_name[] = {
 	RQF_NAME(HASHED),
 	RQF_NAME(STATS),
 	RQF_NAME(SPECIAL_PAYLOAD),
+	RQF_NAME(ZONE_WRITE_PLUGGING),
 	RQF_NAME(TIMED_OUT),
 	RQF_NAME(RESV),
 };
@@ -268,6 +281,9 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
 	const enum req_op op = req_op(rq);
 	const char *op_str = blk_op_str(op);
 
+	BUILD_BUG_ON(ARRAY_SIZE(cmd_flag_name) != __REQ_NR_BITS);
+	BUILD_BUG_ON(ARRAY_SIZE(rqf_name) != __RQF_BITS);
+
 	seq_printf(m, "%p {.op=", rq);
 	if (strcmp(op_str, "UNKNOWN") == 0)
 		seq_printf(m, "%u", op);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 260beea8e332..3bd43b10032f 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -36,6 +36,8 @@ enum {
 	BLK_MQ_TAG_MAX		= BLK_MQ_NO_TAG - 1,
 };
 
+#define BLK_MQ_CPU_WORK_BATCH	(8)
+
 typedef unsigned int __bitwise blk_insert_t;
 #define BLK_MQ_INSERT_AT_HEAD		((__force blk_insert_t)0x01)
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index c1ea214bfc91..1d0db62f0351 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -15,6 +15,7 @@
 
 #define LVB_SIZE	64
 #define NEW_DEV_TIMEOUT 5000
+#define WAIT_DLM_LOCK_TIMEOUT (30 * HZ)
 
 struct dlm_lock_resource {
 	dlm_lockspace_t *ls;
@@ -56,6 +57,7 @@ struct resync_info {
 #define	MD_CLUSTER_ALREADY_IN_CLUSTER		6
 #define	MD_CLUSTER_PENDING_RECV_EVENT		7
 #define	MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD	8
+#define	MD_CLUSTER_WAITING_FOR_SYNC		9
 
 struct md_cluster_info {
 	struct mddev *mddev; /* the md device which md_cluster_info belongs to */
@@ -91,6 +93,7 @@ struct md_cluster_info {
 	sector_t sync_hi;
 };
 
+/* For compatibility, add the new msg_type at the end. */
 enum msg_type {
 	METADATA_UPDATED = 0,
 	RESYNCING,
@@ -100,6 +103,7 @@ enum msg_type {
 	BITMAP_NEEDS_SYNC,
 	CHANGE_CAPACITY,
 	BITMAP_RESIZE,
+	RESYNCING_START,
 };
 
 struct cluster_msg {
@@ -130,8 +134,13 @@ static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
 			0, sync_ast, res, res->bast);
 	if (ret)
 		return ret;
-	wait_event(res->sync_locking, res->sync_locking_done);
+	ret = wait_event_timeout(res->sync_locking, res->sync_locking_done,
+				WAIT_DLM_LOCK_TIMEOUT);
 	res->sync_locking_done = false;
+	if (!ret) {
+		pr_err("locking DLM '%s' timeout!\n", res->name);
+		return -EBUSY;
+	}
 	if (res->lksb.sb_status == 0)
 		res->mode = mode;
 	return res->lksb.sb_status;
@@ -455,6 +464,7 @@ static void process_suspend_info(struct mddev *mddev,
 		clear_bit(MD_RESYNCING_REMOTE, &mddev->recovery);
 		remove_suspend_info(mddev, slot);
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+		clear_bit(MD_CLUSTER_WAITING_FOR_SYNC, &cinfo->state);
 		md_wakeup_thread(mddev->thread);
 		return;
 	}
@@ -525,6 +535,7 @@ static int process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
 		res = -1;
 	}
 	clear_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
+	set_bit(MD_CLUSTER_WAITING_FOR_SYNC, &cinfo->state);
 	return res;
 }
@@ -593,6 +604,9 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
 	case CHANGE_CAPACITY:
 		set_capacity_and_notify(mddev->gendisk, mddev->array_sectors);
 		break;
+	case RESYNCING_START:
+		clear_bit(MD_CLUSTER_WAITING_FOR_SYNC, &mddev->cluster_info->state);
+		break;
 	case RESYNCING:
 		set_bit(MD_RESYNCING_REMOTE, &mddev->recovery);
 		process_suspend_info(mddev, le32_to_cpu(msg->slot),
@@ -743,7 +757,7 @@ static void unlock_comm(struct md_cluster_info *cinfo)
  */
 static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
 {
-	int error;
+	int error, unlock_error;
 	int slot = cinfo->slot_number - 1;
 
 	cmsg->slot = cpu_to_le32(slot);
@@ -751,7 +765,7 @@ static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
 	error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_EX);
 	if (error) {
 		pr_err("md-cluster: failed to get EX on MESSAGE (%d)\n", error);
-		goto failed_message;
+		return error;
 	}
 	memcpy(cinfo->message_lockres->lksb.sb_lvbptr, (void *)cmsg,
@@ -781,14 +795,10 @@ static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
 	}
 
 failed_ack:
-	error = dlm_unlock_sync(cinfo->message_lockres);
-	if (unlikely(error != 0)) {
+	while ((unlock_error = dlm_unlock_sync(cinfo->message_lockres)))
 		pr_err("md-cluster: failed convert to NL on MESSAGE(%d)\n",
-			error);
-		/* in case the message can't be released due to some reason */
-		goto failed_ack;
-	}
-failed_message:
+			unlock_error);
+
 	return error;
 }
@@ -1343,6 +1353,23 @@ static void resync_info_get(struct mddev *mddev, sector_t *lo, sector_t *hi)
 	spin_unlock_irq(&cinfo->suspend_lock);
 }
 
+static int resync_status_get(struct mddev *mddev)
+{
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+
+	return test_bit(MD_CLUSTER_WAITING_FOR_SYNC, &cinfo->state);
+}
+
+static int resync_start_notify(struct mddev *mddev)
+{
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+	struct cluster_msg cmsg = {0};
+
+	cmsg.type = cpu_to_le32(RESYNCING_START);
+
+	return sendmsg(cinfo, &cmsg, 0);
+}
+
 static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
@@ -1577,6 +1604,8 @@ static const struct md_cluster_operations cluster_ops = {
 	.resync_start = resync_start,
 	.resync_finish = resync_finish,
 	.resync_info_update = resync_info_update,
+	.resync_start_notify = resync_start_notify,
+	.resync_status_get = resync_status_get,
 	.resync_info_get = resync_info_get,
 	.metadata_update_start = metadata_update_start,
 	.metadata_update_finish = metadata_update_finish,
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
index a78e3021775d..470bf18ffde5 100644
--- a/drivers/md/md-cluster.h
+++ b/drivers/md/md-cluster.h
@@ -14,6 +14,8 @@ struct md_cluster_operations {
 	int (*leave)(struct mddev *mddev);
 	int (*slot_number)(struct mddev *mddev);
 	int (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi);
+	int (*resync_start_notify)(struct mddev *mddev);
+	int (*resync_status_get)(struct mddev *mddev);
 	void (*resync_info_get)(struct mddev *mddev, sector_t *lo, sector_t *hi);
 	int (*metadata_update_start)(struct mddev *mddev);
 	int (*metadata_update_finish)(struct mddev *mddev);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 64693913ed18..d3a837506a36 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8978,7 +8978,8 @@ void md_do_sync(struct md_thread *thread)
 	 * This will mean we have to start checking from the beginning again.
 	 *
 	 */
-
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->resync_start_notify(mddev);
 	do {
 		int mddev2_minor = -1;
 		mddev->curr_resync = MD_RESYNC_DELAYED;
@@ -9992,8 +9993,18 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
 		 */
 		if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE &&
 		    !(le32_to_cpu(sb->feature_map) &
-		      MD_FEATURE_RESHAPE_ACTIVE)) {
-			rdev2->saved_raid_disk = role;
+		      MD_FEATURE_RESHAPE_ACTIVE) &&
+		    !md_cluster_ops->resync_status_get(mddev)) {
+			/*
+			 * -1 to make raid1_add_disk() set conf->fullsync
+			 * to 1. This could avoid skipping sync when the
+			 * remote node is down during resyncing.
+			 */
+			if ((le32_to_cpu(sb->feature_map)
+			    & MD_FEATURE_RECOVERY_OFFSET))
+				rdev2->saved_raid_disk = -1;
+			else
+				rdev2->saved_raid_disk = role;
 			ret = remove_and_add_spares(mddev, rdev2);
 			pr_info("Activated spare: %pg\n", rdev2->bdev);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 04a0c2ca1732..7acfe7c9dc8d 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -680,6 +680,7 @@ static int choose_slow_rdev(struct r1conf *conf, struct r1bio *r1_bio,
 		len = r1_bio->sectors;
 		read_len = raid1_check_read_range(rdev, this_sector, &len);
 		if (read_len == r1_bio->sectors) {
+			*max_sectors = read_len;
 			update_read_sectors(conf, disk, this_sector, read_len);
 			return disk;
 		}
diff --git a/drivers/s390/block/dasd_devmap.c b/drivers/s390/block/dasd_devmap.c
index 0316c20823ee..6adaeb985dde 100644
--- a/drivers/s390/block/dasd_devmap.c
+++ b/drivers/s390/block/dasd_devmap.c
@@ -2248,13 +2248,19 @@ static ssize_t dasd_copy_pair_store(struct device *dev,
 
 	/* allocate primary devmap if needed */
 	prim_devmap = dasd_find_busid(prim_busid);
-	if (IS_ERR(prim_devmap))
+	if (IS_ERR(prim_devmap)) {
 		prim_devmap = dasd_add_busid(prim_busid, DASD_FEATURE_DEFAULT);
+		if (IS_ERR(prim_devmap))
+			return PTR_ERR(prim_devmap);
+	}
 
 	/* allocate secondary devmap if needed */
 	sec_devmap = dasd_find_busid(sec_busid);
-	if (IS_ERR(sec_devmap))
+	if (IS_ERR(sec_devmap)) {
 		sec_devmap = dasd_add_busid(sec_busid, DASD_FEATURE_DEFAULT);
+		if (IS_ERR(sec_devmap))
+			return PTR_ERR(sec_devmap);
+	}
 
 	/* setting copy relation is only allowed for offline secondary */
 	if (sec_devmap->device)
diff --git a/drivers/s390/block/dasd_diag.c b/drivers/s390/block/dasd_diag.c
index ea4b1d01bb76..8245b742e4a2 100644
--- a/drivers/s390/block/dasd_diag.c
+++ b/drivers/s390/block/dasd_diag.c
@@ -29,6 +29,7 @@
 #include "dasd_int.h"
 #include "dasd_diag.h"
 
+MODULE_DESCRIPTION("S/390 Support for DIAG access to DASD Disks");
 MODULE_LICENSE("GPL");
 
 /* The maximum number of blocks per request (max_blocks) is dependent on the
diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c
index a76c6af9ea63..9388b5c383ca 100644
--- a/drivers/s390/block/dasd_eckd.c
+++ b/drivers/s390/block/dasd_eckd.c
@@ -44,6 +44,7 @@
 /* 64k are 128 x 512 byte sectors */
 #define DASD_RAW_SECTORS_PER_TRACK 128
 
+MODULE_DESCRIPTION("S/390 DASD ECKD Disks device driver");
 MODULE_LICENSE("GPL");
 
 static struct dasd_discipline dasd_eckd_discipline;
diff --git a/drivers/s390/block/dasd_fba.c b/drivers/s390/block/dasd_fba.c
index 9f2023a077c2..a2216795591d 100644
--- a/drivers/s390/block/dasd_fba.c
+++ b/drivers/s390/block/dasd_fba.c
@@ -32,6 +32,7 @@
 #define DASD_FBA_CCW_LOCATE 0x43
 #define DASD_FBA_CCW_DEFINE_EXTENT 0x63
 
+MODULE_DESCRIPTION("S/390 DASD FBA Disks device driver");
 MODULE_LICENSE("GPL");
 
 static struct dasd_discipline dasd_fba_discipline;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 89ba6b16fe8b..8d304b1d16b1 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -27,38 +27,61 @@ typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t);
  * request flags */
 typedef __u32 __bitwise req_flags_t;
 
-/* drive already may have started this one */
-#define RQF_STARTED		((__force req_flags_t)(1 << 1))
-/* request for flush sequence */
-#define RQF_FLUSH_SEQ		((__force req_flags_t)(1 << 4))
-/* merge of different types, fail separately */
-#define RQF_MIXED_MERGE		((__force req_flags_t)(1 << 5))
-/* don't call prep for this one */
-#define RQF_DONTPREP		((__force req_flags_t)(1 << 7))
-/* use hctx->sched_tags */
-#define RQF_SCHED_TAGS		((__force req_flags_t)(1 << 8))
-/* use an I/O scheduler for this request */
-#define RQF_USE_SCHED		((__force req_flags_t)(1 << 9))
-/* vaguely specified driver internal error. Ignored by the block layer */
-#define RQF_FAILED		((__force req_flags_t)(1 << 10))
-/* don't warn about errors */
-#define RQF_QUIET		((__force req_flags_t)(1 << 11))
-/* account into disk and partition IO statistics */
-#define RQF_IO_STAT		((__force req_flags_t)(1 << 13))
-/* runtime pm request */
-#define RQF_PM			((__force req_flags_t)(1 << 15))
-/* on IO scheduler merge hash */
-#define RQF_HASHED		((__force req_flags_t)(1 << 16))
-/* track IO completion time */
-#define RQF_STATS		((__force req_flags_t)(1 << 17))
-/* Look at ->special_vec for the actual data payload instead of the
-   bio chain. */
-#define RQF_SPECIAL_PAYLOAD	((__force req_flags_t)(1 << 18))
-/* The request completion needs to be signaled to zone write pluging. */
-#define RQF_ZONE_WRITE_PLUGGING	((__force req_flags_t)(1 << 20))
-/* ->timeout has been called, don't expire again */
-#define RQF_TIMED_OUT		((__force req_flags_t)(1 << 21))
-#define RQF_RESV		((__force req_flags_t)(1 << 23))
+/* Keep rqf_name[] in sync with the definitions below */
+enum {
+	/* drive already may have started this one */
+	__RQF_STARTED,
+	/* request for flush sequence */
+	__RQF_FLUSH_SEQ,
+	/* merge of different types, fail separately */
+	__RQF_MIXED_MERGE,
+	/* don't call prep for this one */
+	__RQF_DONTPREP,
+	/* use hctx->sched_tags */
+	__RQF_SCHED_TAGS,
+	/* use an I/O scheduler for this request */
+	__RQF_USE_SCHED,
+	/* vaguely specified driver internal error. Ignored by block layer */
+	__RQF_FAILED,
+	/* don't warn about errors */
+	__RQF_QUIET,
+	/* account into disk and partition IO statistics */
+	__RQF_IO_STAT,
+	/* runtime pm request */
+	__RQF_PM,
+	/* on IO scheduler merge hash */
+	__RQF_HASHED,
+	/* track IO completion time */
+	__RQF_STATS,
+	/* Look at ->special_vec for the actual data payload instead of the
+	   bio chain. */
+	__RQF_SPECIAL_PAYLOAD,
+	/* request completion needs to be signaled to zone write plugging. */
+	__RQF_ZONE_WRITE_PLUGGING,
+	/* ->timeout has been called, don't expire again */
+	__RQF_TIMED_OUT,
+	__RQF_RESV,
+	__RQF_BITS
+};
+
+#define RQF_STARTED		((__force req_flags_t)(1 << __RQF_STARTED))
+#define RQF_FLUSH_SEQ		((__force req_flags_t)(1 << __RQF_FLUSH_SEQ))
+#define RQF_MIXED_MERGE		((__force req_flags_t)(1 << __RQF_MIXED_MERGE))
+#define RQF_DONTPREP		((__force req_flags_t)(1 << __RQF_DONTPREP))
+#define RQF_SCHED_TAGS		((__force req_flags_t)(1 << __RQF_SCHED_TAGS))
+#define RQF_USE_SCHED		((__force req_flags_t)(1 << __RQF_USE_SCHED))
+#define RQF_FAILED		((__force req_flags_t)(1 << __RQF_FAILED))
+#define RQF_QUIET		((__force req_flags_t)(1 << __RQF_QUIET))
+#define RQF_IO_STAT		((__force req_flags_t)(1 << __RQF_IO_STAT))
+#define RQF_PM			((__force req_flags_t)(1 << __RQF_PM))
+#define RQF_HASHED		((__force req_flags_t)(1 << __RQF_HASHED))
+#define RQF_STATS		((__force req_flags_t)(1 << __RQF_STATS))
+#define RQF_SPECIAL_PAYLOAD	\
+	((__force req_flags_t)(1 << __RQF_SPECIAL_PAYLOAD))
+#define RQF_ZONE_WRITE_PLUGGING	\
+	((__force req_flags_t)(1 << __RQF_ZONE_WRITE_PLUGGING))
+#define RQF_TIMED_OUT		((__force req_flags_t)(1 << __RQF_TIMED_OUT))
+#define RQF_RESV		((__force req_flags_t)(1 << __RQF_RESV))
 
 /* flags that prevent us from merging requests: */
 #define RQF_NOMERGE_FLAGS \
@@ -278,8 +301,12 @@ enum blk_eh_timer_return {
 	BLK_EH_RESET_TIMER,
 };
 
-#define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */
-#define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */
+/* Keep alloc_policy_name[] in sync with the definitions below */
+enum {
+	BLK_TAG_ALLOC_FIFO,	/* allocate starting from 0 */
+	BLK_TAG_ALLOC_RR,	/* allocate starting from last allocated tag */
+	BLK_TAG_ALLOC_MAX
+};
 
 /**
  * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware
@@ -644,6 +671,7 @@ struct blk_mq_ops {
 #endif
 };
 
+/* Keep hctx_flag_name[] in sync with the definitions below */
 enum {
 	BLK_MQ_F_SHOULD_MERGE	= 1 << 0,
 	BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1,
@@ -653,27 +681,17 @@ enum {
 	 */
 	BLK_MQ_F_STACKING	= 1 << 2,
 	BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3,
-	BLK_MQ_F_BLOCKING	= 1 << 5,
+	BLK_MQ_F_BLOCKING	= 1 << 4,
 	/* Do not allow an I/O scheduler to be configured. */
-	BLK_MQ_F_NO_SCHED	= 1 << 6,
+	BLK_MQ_F_NO_SCHED	= 1 << 5,
+
 	/*
 	 * Select 'none' during queue registration in case of a single hwq
 	 * or shared hwqs instead of 'mq-deadline'.
 	 */
-	BLK_MQ_F_NO_SCHED_BY_DEFAULT	= 1 << 7,
-	BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
+	BLK_MQ_F_NO_SCHED_BY_DEFAULT	= 1 << 6,
+	BLK_MQ_F_ALLOC_POLICY_START_BIT = 7,
 	BLK_MQ_F_ALLOC_POLICY_BITS = 1,
-
-	BLK_MQ_S_STOPPED	= 0,
-	BLK_MQ_S_TAG_ACTIVE	= 1,
-	BLK_MQ_S_SCHED_RESTART	= 2,
-
-	/* hw queue is inactive after all its CPUs become offline */
-	BLK_MQ_S_INACTIVE	= 3,
-
-	BLK_MQ_MAX_DEPTH	= 10240,
-
-	BLK_MQ_CPU_WORK_BATCH	= 8,
 };
 
 #define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \
 	((flags >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \
@@ -682,8 +700,19 @@ enum {
 	((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \
 		<< BLK_MQ_F_ALLOC_POLICY_START_BIT)
 
+#define BLK_MQ_MAX_DEPTH	(10240)
 #define BLK_MQ_NO_HCTX_IDX	(-1U)
 
+enum {
+	/* Keep hctx_state_name[] in sync with the definitions below */
+	BLK_MQ_S_STOPPED,
+	BLK_MQ_S_TAG_ACTIVE,
+	BLK_MQ_S_SCHED_RESTART,
+	/* hw queue is inactive after all its CPUs become offline */
+	BLK_MQ_S_INACTIVE,
+	BLK_MQ_S_MAX
+};
+
 struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set,
 		struct queue_limits *lim, void *queuedata,
 		struct lock_class_key *lkclass);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 632edd71f8c6..36ed96133217 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -354,6 +354,7 @@ enum req_op {
 	REQ_OP_LAST	= (__force blk_opf_t)36,
 };
 
+/* Keep cmd_flag_name[] in sync with the definitions below */
 enum req_flag_bits {
 	__REQ_FAILFAST_DEV =	/* no driver retries of device errors */
 		REQ_OP_BITS,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b8196e219ac2..e85ec73a07d5 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -588,27 +588,28 @@ struct request_queue {
 };
 
 /* Keep blk_queue_flag_name[] in sync with the definitions below */
-#define QUEUE_FLAG_STOPPED	0	/* queue is stopped */
-#define QUEUE_FLAG_DYING	1	/* queue being torn down */
-#define QUEUE_FLAG_NOMERGES	3	/* disable merge attempts */
-#define QUEUE_FLAG_SAME_COMP	4	/* complete on same CPU-group */
-#define QUEUE_FLAG_FAIL_IO	5	/* fake timeout */
-#define QUEUE_FLAG_NOXMERGES	9	/* No extended merges */
-#define QUEUE_FLAG_SAME_FORCE	12	/* force complete on same CPU */
-#define QUEUE_FLAG_INIT_DONE	14	/* queue is initialized */
-#define QUEUE_FLAG_STATS	20	/* track IO start and completion times */
-#define QUEUE_FLAG_REGISTERED	22	/* queue has been registered to a disk */
-#define QUEUE_FLAG_QUIESCED	24	/* queue has been quiesced */
-#define QUEUE_FLAG_RQ_ALLOC_TIME 27	/* record rq->alloc_time_ns */
-#define QUEUE_FLAG_HCTX_ACTIVE	28	/* at least one blk-mq hctx is active */
-#define QUEUE_FLAG_SQ_SCHED	30	/* single queue style io dispatch */
+enum {
+	QUEUE_FLAG_DYING,		/* queue being torn down */
+	QUEUE_FLAG_NOMERGES,		/* disable merge attempts */
+	QUEUE_FLAG_SAME_COMP,		/* complete on same CPU-group */
+	QUEUE_FLAG_FAIL_IO,		/* fake timeout */
+	QUEUE_FLAG_NOXMERGES,		/* No extended merges */
+	QUEUE_FLAG_SAME_FORCE,		/* force complete on same CPU */
+	QUEUE_FLAG_INIT_DONE,		/* queue is initialized */
+	QUEUE_FLAG_STATS,		/* track IO start and completion times */
+	QUEUE_FLAG_REGISTERED,		/* queue has been registered to a disk */
+	QUEUE_FLAG_QUIESCED,		/* queue has been quiesced */
+	QUEUE_FLAG_RQ_ALLOC_TIME,	/* record rq->alloc_time_ns */
+	QUEUE_FLAG_HCTX_ACTIVE,		/* at least one blk-mq hctx is active */
+	QUEUE_FLAG_SQ_SCHED,		/* single queue style io dispatch */
+	QUEUE_FLAG_MAX
+};
 
 #define QUEUE_FLAG_MQ_DEFAULT	(1UL << QUEUE_FLAG_SAME_COMP)
 
 void blk_queue_flag_set(unsigned int flag, struct request_queue *q);
 void blk_queue_flag_clear(unsigned int flag, struct request_queue *q);
 
-#define blk_queue_stopped(q)	test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
 #define blk_queue_dying(q)	test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags)
 #define blk_queue_init_done(q)	test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags)
 #define blk_queue_nomerges(q)	test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 293af7f8a694..ae04035b6cbe 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -539,9 +539,6 @@ struct cgroup {
 	/* used to store eBPF programs */
 	struct cgroup_bpf bpf;
 
-	/* If there is block congestion on this cgroup. */
-	atomic_t congestion_count;
-
 	/* Used to store internal freezer state */
 	struct cgroup_freezer_state freezer;
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index d662cf136021..c09cdcc99471 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -36,6 +36,11 @@ struct sbitmap_word {
 	 * @cleared: word holding cleared bits
 	 */
 	unsigned long cleared ____cacheline_aligned_in_smp;
+
+	/**
+	 * @swap_lock: serializes simultaneous updates of ->word and ->cleared
+	 */
+	spinlock_t swap_lock;
 } ____cacheline_aligned_in_smp;
 
 /**
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 1e453f825c05..5e2e93307f0d 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -60,12 +60,30 @@ static inline void update_alloc_hint_after_get(struct sbitmap *sb,
 /*
  * See if we have deferred clears that we can batch move
  */
-static inline bool sbitmap_deferred_clear(struct sbitmap_word *map)
+static inline bool sbitmap_deferred_clear(struct sbitmap_word *map,
+		unsigned int depth, unsigned int alloc_hint, bool wrap)
 {
-	unsigned long mask;
+	unsigned long mask, word_mask;
 
-	if (!READ_ONCE(map->cleared))
-		return false;
+	guard(spinlock_irqsave)(&map->swap_lock);
+
+	if (!map->cleared) {
+		if (depth == 0)
+			return false;
+
+		word_mask = (~0UL) >> (BITS_PER_LONG - depth);
+		/*
+		 * The current behavior is to always retry after moving
+		 * ->cleared to word, and we change it to retry in case
+		 * of any free bits. To avoid an infinite loop, we need
+		 * to take wrap & alloc_hint into account, otherwise a
+		 * soft lockup may occur.
+		 */
+		if (!wrap && alloc_hint)
+			word_mask &= ~((1UL << alloc_hint) - 1);
+
+		return (READ_ONCE(map->word) & word_mask) != word_mask;
+	}
 
 	/*
 	 * First get a stable cleared mask, setting the old mask to 0.
@@ -85,6 +103,7 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
 		      bool alloc_hint)
 {
 	unsigned int bits_per_word;
+	int i;
 
 	if (shift < 0)
 		shift = sbitmap_calculate_shift(depth);
@@ -116,6 +135,9 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
 		return -ENOMEM;
 	}
 
+	for (i = 0; i < sb->map_nr; i++)
+		spin_lock_init(&sb->map[i].swap_lock);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(sbitmap_init_node);
@@ -126,7 +148,7 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth)
 	unsigned int i;
 
 	for (i = 0; i < sb->map_nr; i++)
-		sbitmap_deferred_clear(&sb->map[i]);
+		sbitmap_deferred_clear(&sb->map[i], 0, 0, 0);
 
 	sb->depth = depth;
 	sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word);
@@ -179,7 +201,7 @@ static int sbitmap_find_bit_in_word(struct sbitmap_word *map,
 					alloc_hint, wrap);
 		if (nr != -1)
 			break;
-		if (!sbitmap_deferred_clear(map))
+		if (!sbitmap_deferred_clear(map, depth, alloc_hint, wrap))
 			break;
 	} while (1);
@@ -496,7 +518,7 @@ unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags,
 		unsigned int map_depth = __map_depth(sb, index);
 		unsigned long val;
 
-		sbitmap_deferred_clear(map);
+		sbitmap_deferred_clear(map, 0, 0, 0);
 
 		val = READ_ONCE(map->word);
 		if (val == (1UL << (map_depth - 1)) - 1)
			goto next;
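
For reference, the core of the sbitmap fix above is the new retry test:
when there are no deferred clears to flush, the search retries only if a
free bit is still visible in the word, with bits below alloc_hint masked
off for non-wrapping searches so the loop must terminate. A standalone
sketch of that check (distilled from the lib/sbitmap.c hunk, not the
kernel function itself; retry_search() is a made-up name):

    #include <limits.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define BITS_PER_LONG ((int)(CHAR_BIT * sizeof(unsigned long)))

    /* retry only if some bit in the searchable range is still free */
    static bool retry_search(unsigned long word, unsigned int depth,
                             unsigned int alloc_hint, bool wrap)
    {
            unsigned long word_mask;

            if (depth == 0)
                    return false;

            word_mask = (~0UL) >> (BITS_PER_LONG - depth);
            /* a non-wrapping search never revisits bits below alloc_hint */
            if (!wrap && alloc_hint)
                    word_mask &= ~((1UL << alloc_hint) - 1);

            return (word & word_mask) != word_mask;
    }

    int main(void)
    {
            /* depth 8, bits 0-2 allocated: free bits remain, retry */
            printf("%d\n", retry_search(0x07UL, 8, 0, true));   /* 1 */
            /* depth 8, all bits allocated: nothing free, stop */
            printf("%d\n", retry_search(0xffUL, 8, 0, true));   /* 0 */
            return 0;
    }

In the kernel this test runs under the new per-word swap_lock, which also
serializes moving ->cleared bits back into ->word; that is what closes the
race that caused the hang.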