summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-07-22 11:32:05 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2024-07-22 11:32:05 -0700
commit7d080fa867092c1db078dd72d70cb256642f7b18 (patch)
tree6185f09f14510e8d733d05c68c0aa6fcde262cf0
parent0256994887d7c89c2a41d872aac67605bda8f115 (diff)
parent89ed6c9ac69ec398ccb648f5f675b43e8ca679ca (diff)
downloadlinux-stable-7d080fa867092c1db078dd72d70cb256642f7b18.tar.gz
linux-stable-7d080fa867092c1db078dd72d70cb256642f7b18.tar.bz2
linux-stable-7d080fa867092c1db078dd72d70cb256642f7b18.zip
Merge tag 'for-6.11/block-20240722' of git://git.kernel.dk/linux
Pull more block updates from Jens Axboe: - MD fixes via Song: - md-cluster fixes (Heming Zhao) - raid1 fix (Mateusz Jończyk) - s390/dasd module description (Jeff) - Series cleaning up and hardening the blk-mq debugfs flag handling (John, Christoph) - blk-cgroup cleanup (Xiu) - Error polled IO attempts if backend doesn't support it (hexue) - Fix for an sbitmap hang (Yang) * tag 'for-6.11/block-20240722' of git://git.kernel.dk/linux: (23 commits) blk-cgroup: move congestion_count to struct blkcg sbitmap: fix io hung due to race on sbitmap_word::cleared block: avoid polling configuration errors block: Catch possible entries missing from rqf_name[] block: Simplify definition of RQF_NAME() block: Use enum to define RQF_x bit indexes block: Catch possible entries missing from cmd_flag_name[] block: Catch possible entries missing from alloc_policy_name[] block: Catch possible entries missing from hctx_flag_name[] block: Catch possible entries missing from hctx_state_name[] block: Catch possible entries missing from blk_queue_flag_name[] block: Make QUEUE_FLAG_x as an enum block: Relocate BLK_MQ_MAX_DEPTH block: Relocate BLK_MQ_CPU_WORK_BATCH block: remove QUEUE_FLAG_STOPPED block: Add missing entry to hctx_flag_name[] block: Add zone write plugging entry to rqf_name[] block: Add missing entries from cmd_flag_name[] s390/dasd: fix error checks in dasd_copy_pair_store() s390/dasd: add missing MODULE_DESCRIPTION() macros ...
-rw-r--r--block/blk-cgroup.c7
-rw-r--r--block/blk-cgroup.h10
-rw-r--r--block/blk-core.c5
-rw-r--r--block/blk-mq-debugfs.c26
-rw-r--r--block/blk-mq.h2
-rw-r--r--drivers/md/md-cluster.c49
-rw-r--r--drivers/md/md-cluster.h2
-rw-r--r--drivers/md/md.c17
-rw-r--r--drivers/md/raid1.c1
-rw-r--r--drivers/s390/block/dasd_devmap.c10
-rw-r--r--drivers/s390/block/dasd_diag.c1
-rw-r--r--drivers/s390/block/dasd_eckd.c1
-rw-r--r--drivers/s390/block/dasd_fba.c1
-rw-r--r--include/linux/blk-mq.h127
-rw-r--r--include/linux/blk_types.h1
-rw-r--r--include/linux/blkdev.h31
-rw-r--r--include/linux/cgroup-defs.h3
-rw-r--r--include/linux/sbitmap.h5
-rw-r--r--lib/sbitmap.c36
19 files changed, 233 insertions, 102 deletions
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 37e6cc91d576..69e70964398c 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -2182,12 +2182,13 @@ void blk_cgroup_bio_start(struct bio *bio)
bool blk_cgroup_congested(void)
{
- struct cgroup_subsys_state *css;
+ struct blkcg *blkcg;
bool ret = false;
rcu_read_lock();
- for (css = blkcg_css(); css; css = css->parent) {
- if (atomic_read(&css->cgroup->congestion_count)) {
+ for (blkcg = css_to_blkcg(blkcg_css()); blkcg;
+ blkcg = blkcg_parent(blkcg)) {
+ if (atomic_read(&blkcg->congestion_count)) {
ret = true;
break;
}
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index bd472a30bc61..864fad4a850b 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -95,6 +95,8 @@ struct blkcg {
struct cgroup_subsys_state css;
spinlock_t lock;
refcount_t online_pin;
+ /* If there is block congestion on this cgroup. */
+ atomic_t congestion_count;
struct radix_tree_root blkg_tree;
struct blkcg_gq __rcu *blkg_hint;
@@ -374,7 +376,7 @@ static inline void blkcg_use_delay(struct blkcg_gq *blkg)
if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0))
return;
if (atomic_add_return(1, &blkg->use_delay) == 1)
- atomic_inc(&blkg->blkcg->css.cgroup->congestion_count);
+ atomic_inc(&blkg->blkcg->congestion_count);
}
static inline int blkcg_unuse_delay(struct blkcg_gq *blkg)
@@ -399,7 +401,7 @@ static inline int blkcg_unuse_delay(struct blkcg_gq *blkg)
if (old == 0)
return 0;
if (old == 1)
- atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
+ atomic_dec(&blkg->blkcg->congestion_count);
return 1;
}
@@ -418,7 +420,7 @@ static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay)
/* We only want 1 person setting the congestion count for this blkg. */
if (!old && atomic_try_cmpxchg(&blkg->use_delay, &old, -1))
- atomic_inc(&blkg->blkcg->css.cgroup->congestion_count);
+ atomic_inc(&blkg->blkcg->congestion_count);
atomic64_set(&blkg->delay_nsec, delay);
}
@@ -435,7 +437,7 @@ static inline void blkcg_clear_delay(struct blkcg_gq *blkg)
/* We only want 1 person clearing the congestion count for this blkg. */
if (old && atomic_try_cmpxchg(&blkg->use_delay, &old, 0))
- atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
+ atomic_dec(&blkg->blkcg->congestion_count);
}
/**
diff --git a/block/blk-core.c b/block/blk-core.c
index 02bceeb36f2c..1217c2cd66dd 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -791,8 +791,11 @@ void submit_bio_noacct(struct bio *bio)
}
}
- if (!(q->limits.features & BLK_FEAT_POLL))
+ if (!(q->limits.features & BLK_FEAT_POLL) &&
+ (bio->bi_opf & REQ_POLLED)) {
bio_clear_polled(bio);
+ goto not_supported;
+ }
switch (bio_op(bio)) {
case REQ_OP_READ:
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 344f9e503bdb..5463697a8442 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -5,6 +5,7 @@
#include <linux/kernel.h>
#include <linux/blkdev.h>
+#include <linux/build_bug.h>
#include <linux/debugfs.h>
#include "blk.h"
@@ -79,7 +80,6 @@ static int queue_pm_only_show(void *data, struct seq_file *m)
#define QUEUE_FLAG_NAME(name) [QUEUE_FLAG_##name] = #name
static const char *const blk_queue_flag_name[] = {
- QUEUE_FLAG_NAME(STOPPED),
QUEUE_FLAG_NAME(DYING),
QUEUE_FLAG_NAME(NOMERGES),
QUEUE_FLAG_NAME(SAME_COMP),
@@ -100,6 +100,7 @@ static int queue_state_show(void *data, struct seq_file *m)
{
struct request_queue *q = data;
+ BUILD_BUG_ON(ARRAY_SIZE(blk_queue_flag_name) != QUEUE_FLAG_MAX);
blk_flags_show(m, q->queue_flags, blk_queue_flag_name,
ARRAY_SIZE(blk_queue_flag_name));
seq_puts(m, "\n");
@@ -164,6 +165,7 @@ static int hctx_state_show(void *data, struct seq_file *m)
{
struct blk_mq_hw_ctx *hctx = data;
+ BUILD_BUG_ON(ARRAY_SIZE(hctx_state_name) != BLK_MQ_S_MAX);
blk_flags_show(m, hctx->state, hctx_state_name,
ARRAY_SIZE(hctx_state_name));
seq_puts(m, "\n");
@@ -181,10 +183,11 @@ static const char *const alloc_policy_name[] = {
static const char *const hctx_flag_name[] = {
HCTX_FLAG_NAME(SHOULD_MERGE),
HCTX_FLAG_NAME(TAG_QUEUE_SHARED),
- HCTX_FLAG_NAME(BLOCKING),
- HCTX_FLAG_NAME(NO_SCHED),
HCTX_FLAG_NAME(STACKING),
HCTX_FLAG_NAME(TAG_HCTX_SHARED),
+ HCTX_FLAG_NAME(BLOCKING),
+ HCTX_FLAG_NAME(NO_SCHED),
+ HCTX_FLAG_NAME(NO_SCHED_BY_DEFAULT),
};
#undef HCTX_FLAG_NAME
@@ -193,6 +196,10 @@ static int hctx_flags_show(void *data, struct seq_file *m)
struct blk_mq_hw_ctx *hctx = data;
const int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(hctx->flags);
+ BUILD_BUG_ON(ARRAY_SIZE(hctx_flag_name) !=
+ BLK_MQ_F_ALLOC_POLICY_START_BIT);
+ BUILD_BUG_ON(ARRAY_SIZE(alloc_policy_name) != BLK_TAG_ALLOC_MAX);
+
seq_puts(m, "alloc_policy=");
if (alloc_policy < ARRAY_SIZE(alloc_policy_name) &&
alloc_policy_name[alloc_policy])
@@ -223,12 +230,17 @@ static const char *const cmd_flag_name[] = {
CMD_FLAG_NAME(RAHEAD),
CMD_FLAG_NAME(BACKGROUND),
CMD_FLAG_NAME(NOWAIT),
- CMD_FLAG_NAME(NOUNMAP),
CMD_FLAG_NAME(POLLED),
+ CMD_FLAG_NAME(ALLOC_CACHE),
+ CMD_FLAG_NAME(SWAP),
+ CMD_FLAG_NAME(DRV),
+ CMD_FLAG_NAME(FS_PRIVATE),
+ CMD_FLAG_NAME(ATOMIC),
+ CMD_FLAG_NAME(NOUNMAP),
};
#undef CMD_FLAG_NAME
-#define RQF_NAME(name) [ilog2((__force u32)RQF_##name)] = #name
+#define RQF_NAME(name) [__RQF_##name] = #name
static const char *const rqf_name[] = {
RQF_NAME(STARTED),
RQF_NAME(FLUSH_SEQ),
@@ -243,6 +255,7 @@ static const char *const rqf_name[] = {
RQF_NAME(HASHED),
RQF_NAME(STATS),
RQF_NAME(SPECIAL_PAYLOAD),
+ RQF_NAME(ZONE_WRITE_PLUGGING),
RQF_NAME(TIMED_OUT),
RQF_NAME(RESV),
};
@@ -268,6 +281,9 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
const enum req_op op = req_op(rq);
const char *op_str = blk_op_str(op);
+ BUILD_BUG_ON(ARRAY_SIZE(cmd_flag_name) != __REQ_NR_BITS);
+ BUILD_BUG_ON(ARRAY_SIZE(rqf_name) != __RQF_BITS);
+
seq_printf(m, "%p {.op=", rq);
if (strcmp(op_str, "UNKNOWN") == 0)
seq_printf(m, "%u", op);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 260beea8e332..3bd43b10032f 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -36,6 +36,8 @@ enum {
BLK_MQ_TAG_MAX = BLK_MQ_NO_TAG - 1,
};
+#define BLK_MQ_CPU_WORK_BATCH (8)
+
typedef unsigned int __bitwise blk_insert_t;
#define BLK_MQ_INSERT_AT_HEAD ((__force blk_insert_t)0x01)
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index c1ea214bfc91..1d0db62f0351 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -15,6 +15,7 @@
#define LVB_SIZE 64
#define NEW_DEV_TIMEOUT 5000
+#define WAIT_DLM_LOCK_TIMEOUT (30 * HZ)
struct dlm_lock_resource {
dlm_lockspace_t *ls;
@@ -56,6 +57,7 @@ struct resync_info {
#define MD_CLUSTER_ALREADY_IN_CLUSTER 6
#define MD_CLUSTER_PENDING_RECV_EVENT 7
#define MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD 8
+#define MD_CLUSTER_WAITING_FOR_SYNC 9
struct md_cluster_info {
struct mddev *mddev; /* the md device which md_cluster_info belongs to */
@@ -91,6 +93,7 @@ struct md_cluster_info {
sector_t sync_hi;
};
+/* For compatibility, add the new msg_type at the end. */
enum msg_type {
METADATA_UPDATED = 0,
RESYNCING,
@@ -100,6 +103,7 @@ enum msg_type {
BITMAP_NEEDS_SYNC,
CHANGE_CAPACITY,
BITMAP_RESIZE,
+ RESYNCING_START,
};
struct cluster_msg {
@@ -130,8 +134,13 @@ static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
0, sync_ast, res, res->bast);
if (ret)
return ret;
- wait_event(res->sync_locking, res->sync_locking_done);
+ ret = wait_event_timeout(res->sync_locking, res->sync_locking_done,
+ WAIT_DLM_LOCK_TIMEOUT);
res->sync_locking_done = false;
+ if (!ret) {
+ pr_err("locking DLM '%s' timeout!\n", res->name);
+ return -EBUSY;
+ }
if (res->lksb.sb_status == 0)
res->mode = mode;
return res->lksb.sb_status;
@@ -455,6 +464,7 @@ static void process_suspend_info(struct mddev *mddev,
clear_bit(MD_RESYNCING_REMOTE, &mddev->recovery);
remove_suspend_info(mddev, slot);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ clear_bit(MD_CLUSTER_WAITING_FOR_SYNC, &cinfo->state);
md_wakeup_thread(mddev->thread);
return;
}
@@ -525,6 +535,7 @@ static int process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
res = -1;
}
clear_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
+ set_bit(MD_CLUSTER_WAITING_FOR_SYNC, &cinfo->state);
return res;
}
@@ -593,6 +604,9 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
case CHANGE_CAPACITY:
set_capacity_and_notify(mddev->gendisk, mddev->array_sectors);
break;
+ case RESYNCING_START:
+ clear_bit(MD_CLUSTER_WAITING_FOR_SYNC, &mddev->cluster_info->state);
+ break;
case RESYNCING:
set_bit(MD_RESYNCING_REMOTE, &mddev->recovery);
process_suspend_info(mddev, le32_to_cpu(msg->slot),
@@ -743,7 +757,7 @@ static void unlock_comm(struct md_cluster_info *cinfo)
*/
static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
{
- int error;
+ int error, unlock_error;
int slot = cinfo->slot_number - 1;
cmsg->slot = cpu_to_le32(slot);
@@ -751,7 +765,7 @@ static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_EX);
if (error) {
pr_err("md-cluster: failed to get EX on MESSAGE (%d)\n", error);
- goto failed_message;
+ return error;
}
memcpy(cinfo->message_lockres->lksb.sb_lvbptr, (void *)cmsg,
@@ -781,14 +795,10 @@ static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
}
failed_ack:
- error = dlm_unlock_sync(cinfo->message_lockres);
- if (unlikely(error != 0)) {
+ while ((unlock_error = dlm_unlock_sync(cinfo->message_lockres)))
pr_err("md-cluster: failed convert to NL on MESSAGE(%d)\n",
- error);
- /* in case the message can't be released due to some reason */
- goto failed_ack;
- }
-failed_message:
+ unlock_error);
+
return error;
}
@@ -1343,6 +1353,23 @@ static void resync_info_get(struct mddev *mddev, sector_t *lo, sector_t *hi)
spin_unlock_irq(&cinfo->suspend_lock);
}
+static int resync_status_get(struct mddev *mddev)
+{
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+
+ return test_bit(MD_CLUSTER_WAITING_FOR_SYNC, &cinfo->state);
+}
+
+static int resync_start_notify(struct mddev *mddev)
+{
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+ struct cluster_msg cmsg = {0};
+
+ cmsg.type = cpu_to_le32(RESYNCING_START);
+
+ return sendmsg(cinfo, &cmsg, 0);
+}
+
static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
@@ -1577,6 +1604,8 @@ static const struct md_cluster_operations cluster_ops = {
.resync_start = resync_start,
.resync_finish = resync_finish,
.resync_info_update = resync_info_update,
+ .resync_start_notify = resync_start_notify,
+ .resync_status_get = resync_status_get,
.resync_info_get = resync_info_get,
.metadata_update_start = metadata_update_start,
.metadata_update_finish = metadata_update_finish,
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
index a78e3021775d..470bf18ffde5 100644
--- a/drivers/md/md-cluster.h
+++ b/drivers/md/md-cluster.h
@@ -14,6 +14,8 @@ struct md_cluster_operations {
int (*leave)(struct mddev *mddev);
int (*slot_number)(struct mddev *mddev);
int (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi);
+ int (*resync_start_notify)(struct mddev *mddev);
+ int (*resync_status_get)(struct mddev *mddev);
void (*resync_info_get)(struct mddev *mddev, sector_t *lo, sector_t *hi);
int (*metadata_update_start)(struct mddev *mddev);
int (*metadata_update_finish)(struct mddev *mddev);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 64693913ed18..d3a837506a36 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8978,7 +8978,8 @@ void md_do_sync(struct md_thread *thread)
* This will mean we have to start checking from the beginning again.
*
*/
-
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->resync_start_notify(mddev);
do {
int mddev2_minor = -1;
mddev->curr_resync = MD_RESYNC_DELAYED;
@@ -9992,8 +9993,18 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
*/
if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE &&
!(le32_to_cpu(sb->feature_map) &
- MD_FEATURE_RESHAPE_ACTIVE)) {
- rdev2->saved_raid_disk = role;
+ MD_FEATURE_RESHAPE_ACTIVE) &&
+ !md_cluster_ops->resync_status_get(mddev)) {
+ /*
+ * -1 to make raid1_add_disk() set conf->fullsync
+ * to 1. This could avoid skipping sync when the
+ * remote node is down during resyncing.
+ */
+ if ((le32_to_cpu(sb->feature_map)
+ & MD_FEATURE_RECOVERY_OFFSET))
+ rdev2->saved_raid_disk = -1;
+ else
+ rdev2->saved_raid_disk = role;
ret = remove_and_add_spares(mddev, rdev2);
pr_info("Activated spare: %pg\n",
rdev2->bdev);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 04a0c2ca1732..7acfe7c9dc8d 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -680,6 +680,7 @@ static int choose_slow_rdev(struct r1conf *conf, struct r1bio *r1_bio,
len = r1_bio->sectors;
read_len = raid1_check_read_range(rdev, this_sector, &len);
if (read_len == r1_bio->sectors) {
+ *max_sectors = read_len;
update_read_sectors(conf, disk, this_sector, read_len);
return disk;
}
diff --git a/drivers/s390/block/dasd_devmap.c b/drivers/s390/block/dasd_devmap.c
index 0316c20823ee..6adaeb985dde 100644
--- a/drivers/s390/block/dasd_devmap.c
+++ b/drivers/s390/block/dasd_devmap.c
@@ -2248,13 +2248,19 @@ static ssize_t dasd_copy_pair_store(struct device *dev,
/* allocate primary devmap if needed */
prim_devmap = dasd_find_busid(prim_busid);
- if (IS_ERR(prim_devmap))
+ if (IS_ERR(prim_devmap)) {
prim_devmap = dasd_add_busid(prim_busid, DASD_FEATURE_DEFAULT);
+ if (IS_ERR(prim_devmap))
+ return PTR_ERR(prim_devmap);
+ }
/* allocate secondary devmap if needed */
sec_devmap = dasd_find_busid(sec_busid);
- if (IS_ERR(sec_devmap))
+ if (IS_ERR(sec_devmap)) {
sec_devmap = dasd_add_busid(sec_busid, DASD_FEATURE_DEFAULT);
+ if (IS_ERR(sec_devmap))
+ return PTR_ERR(sec_devmap);
+ }
/* setting copy relation is only allowed for offline secondary */
if (sec_devmap->device)
diff --git a/drivers/s390/block/dasd_diag.c b/drivers/s390/block/dasd_diag.c
index ea4b1d01bb76..8245b742e4a2 100644
--- a/drivers/s390/block/dasd_diag.c
+++ b/drivers/s390/block/dasd_diag.c
@@ -29,6 +29,7 @@
#include "dasd_int.h"
#include "dasd_diag.h"
+MODULE_DESCRIPTION("S/390 Support for DIAG access to DASD Disks");
MODULE_LICENSE("GPL");
/* The maximum number of blocks per request (max_blocks) is dependent on the
diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c
index a76c6af9ea63..9388b5c383ca 100644
--- a/drivers/s390/block/dasd_eckd.c
+++ b/drivers/s390/block/dasd_eckd.c
@@ -44,6 +44,7 @@
/* 64k are 128 x 512 byte sectors */
#define DASD_RAW_SECTORS_PER_TRACK 128
+MODULE_DESCRIPTION("S/390 DASD ECKD Disks device driver");
MODULE_LICENSE("GPL");
static struct dasd_discipline dasd_eckd_discipline;
diff --git a/drivers/s390/block/dasd_fba.c b/drivers/s390/block/dasd_fba.c
index 9f2023a077c2..a2216795591d 100644
--- a/drivers/s390/block/dasd_fba.c
+++ b/drivers/s390/block/dasd_fba.c
@@ -32,6 +32,7 @@
#define DASD_FBA_CCW_LOCATE 0x43
#define DASD_FBA_CCW_DEFINE_EXTENT 0x63
+MODULE_DESCRIPTION("S/390 DASD FBA Disks device driver");
MODULE_LICENSE("GPL");
static struct dasd_discipline dasd_fba_discipline;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 89ba6b16fe8b..8d304b1d16b1 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -27,38 +27,61 @@ typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t);
* request flags */
typedef __u32 __bitwise req_flags_t;
-/* drive already may have started this one */
-#define RQF_STARTED ((__force req_flags_t)(1 << 1))
-/* request for flush sequence */
-#define RQF_FLUSH_SEQ ((__force req_flags_t)(1 << 4))
-/* merge of different types, fail separately */
-#define RQF_MIXED_MERGE ((__force req_flags_t)(1 << 5))
-/* don't call prep for this one */
-#define RQF_DONTPREP ((__force req_flags_t)(1 << 7))
-/* use hctx->sched_tags */
-#define RQF_SCHED_TAGS ((__force req_flags_t)(1 << 8))
-/* use an I/O scheduler for this request */
-#define RQF_USE_SCHED ((__force req_flags_t)(1 << 9))
-/* vaguely specified driver internal error. Ignored by the block layer */
-#define RQF_FAILED ((__force req_flags_t)(1 << 10))
-/* don't warn about errors */
-#define RQF_QUIET ((__force req_flags_t)(1 << 11))
-/* account into disk and partition IO statistics */
-#define RQF_IO_STAT ((__force req_flags_t)(1 << 13))
-/* runtime pm request */
-#define RQF_PM ((__force req_flags_t)(1 << 15))
-/* on IO scheduler merge hash */
-#define RQF_HASHED ((__force req_flags_t)(1 << 16))
-/* track IO completion time */
-#define RQF_STATS ((__force req_flags_t)(1 << 17))
-/* Look at ->special_vec for the actual data payload instead of the
- bio chain. */
-#define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18))
-/* The request completion needs to be signaled to zone write pluging. */
-#define RQF_ZONE_WRITE_PLUGGING ((__force req_flags_t)(1 << 20))
-/* ->timeout has been called, don't expire again */
-#define RQF_TIMED_OUT ((__force req_flags_t)(1 << 21))
-#define RQF_RESV ((__force req_flags_t)(1 << 23))
+/* Keep rqf_name[] in sync with the definitions below */
+enum {
+ /* drive already may have started this one */
+ __RQF_STARTED,
+ /* request for flush sequence */
+ __RQF_FLUSH_SEQ,
+ /* merge of different types, fail separately */
+ __RQF_MIXED_MERGE,
+ /* don't call prep for this one */
+ __RQF_DONTPREP,
+ /* use hctx->sched_tags */
+ __RQF_SCHED_TAGS,
+ /* use an I/O scheduler for this request */
+ __RQF_USE_SCHED,
+ /* vaguely specified driver internal error. Ignored by block layer */
+ __RQF_FAILED,
+ /* don't warn about errors */
+ __RQF_QUIET,
+ /* account into disk and partition IO statistics */
+ __RQF_IO_STAT,
+ /* runtime pm request */
+ __RQF_PM,
+ /* on IO scheduler merge hash */
+ __RQF_HASHED,
+ /* track IO completion time */
+ __RQF_STATS,
+ /* Look at ->special_vec for the actual data payload instead of the
+ bio chain. */
+ __RQF_SPECIAL_PAYLOAD,
+ /* request completion needs to be signaled to zone write plugging. */
+ __RQF_ZONE_WRITE_PLUGGING,
+ /* ->timeout has been called, don't expire again */
+ __RQF_TIMED_OUT,
+ __RQF_RESV,
+ __RQF_BITS
+};
+
+#define RQF_STARTED ((__force req_flags_t)(1 << __RQF_STARTED))
+#define RQF_FLUSH_SEQ ((__force req_flags_t)(1 << __RQF_FLUSH_SEQ))
+#define RQF_MIXED_MERGE ((__force req_flags_t)(1 << __RQF_MIXED_MERGE))
+#define RQF_DONTPREP ((__force req_flags_t)(1 << __RQF_DONTPREP))
+#define RQF_SCHED_TAGS ((__force req_flags_t)(1 << __RQF_SCHED_TAGS))
+#define RQF_USE_SCHED ((__force req_flags_t)(1 << __RQF_USE_SCHED))
+#define RQF_FAILED ((__force req_flags_t)(1 << __RQF_FAILED))
+#define RQF_QUIET ((__force req_flags_t)(1 << __RQF_QUIET))
+#define RQF_IO_STAT ((__force req_flags_t)(1 << __RQF_IO_STAT))
+#define RQF_PM ((__force req_flags_t)(1 << __RQF_PM))
+#define RQF_HASHED ((__force req_flags_t)(1 << __RQF_HASHED))
+#define RQF_STATS ((__force req_flags_t)(1 << __RQF_STATS))
+#define RQF_SPECIAL_PAYLOAD \
+ ((__force req_flags_t)(1 << __RQF_SPECIAL_PAYLOAD))
+#define RQF_ZONE_WRITE_PLUGGING \
+ ((__force req_flags_t)(1 << __RQF_ZONE_WRITE_PLUGGING))
+#define RQF_TIMED_OUT ((__force req_flags_t)(1 << __RQF_TIMED_OUT))
+#define RQF_RESV ((__force req_flags_t)(1 << __RQF_RESV))
/* flags that prevent us from merging requests: */
#define RQF_NOMERGE_FLAGS \
@@ -278,8 +301,12 @@ enum blk_eh_timer_return {
BLK_EH_RESET_TIMER,
};
-#define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */
-#define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */
+/* Keep alloc_policy_name[] in sync with the definitions below */
+enum {
+ BLK_TAG_ALLOC_FIFO, /* allocate starting from 0 */
+ BLK_TAG_ALLOC_RR, /* allocate starting from last allocated tag */
+ BLK_TAG_ALLOC_MAX
+};
/**
* struct blk_mq_hw_ctx - State for a hardware queue facing the hardware
@@ -644,6 +671,7 @@ struct blk_mq_ops {
#endif
};
+/* Keep hctx_flag_name[] in sync with the definitions below */
enum {
BLK_MQ_F_SHOULD_MERGE = 1 << 0,
BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1,
@@ -653,27 +681,17 @@ enum {
*/
BLK_MQ_F_STACKING = 1 << 2,
BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3,
- BLK_MQ_F_BLOCKING = 1 << 5,
+ BLK_MQ_F_BLOCKING = 1 << 4,
/* Do not allow an I/O scheduler to be configured. */
- BLK_MQ_F_NO_SCHED = 1 << 6,
+ BLK_MQ_F_NO_SCHED = 1 << 5,
+
/*
* Select 'none' during queue registration in case of a single hwq
* or shared hwqs instead of 'mq-deadline'.
*/
- BLK_MQ_F_NO_SCHED_BY_DEFAULT = 1 << 7,
- BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
+ BLK_MQ_F_NO_SCHED_BY_DEFAULT = 1 << 6,
+ BLK_MQ_F_ALLOC_POLICY_START_BIT = 7,
BLK_MQ_F_ALLOC_POLICY_BITS = 1,
-
- BLK_MQ_S_STOPPED = 0,
- BLK_MQ_S_TAG_ACTIVE = 1,
- BLK_MQ_S_SCHED_RESTART = 2,
-
- /* hw queue is inactive after all its CPUs become offline */
- BLK_MQ_S_INACTIVE = 3,
-
- BLK_MQ_MAX_DEPTH = 10240,
-
- BLK_MQ_CPU_WORK_BATCH = 8,
};
#define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \
((flags >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \
@@ -682,8 +700,19 @@ enum {
((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \
<< BLK_MQ_F_ALLOC_POLICY_START_BIT)
+#define BLK_MQ_MAX_DEPTH (10240)
#define BLK_MQ_NO_HCTX_IDX (-1U)
+enum {
+ /* Keep hctx_state_name[] in sync with the definitions below */
+ BLK_MQ_S_STOPPED,
+ BLK_MQ_S_TAG_ACTIVE,
+ BLK_MQ_S_SCHED_RESTART,
+ /* hw queue is inactive after all its CPUs become offline */
+ BLK_MQ_S_INACTIVE,
+ BLK_MQ_S_MAX
+};
+
struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set,
struct queue_limits *lim, void *queuedata,
struct lock_class_key *lkclass);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 632edd71f8c6..36ed96133217 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -354,6 +354,7 @@ enum req_op {
REQ_OP_LAST = (__force blk_opf_t)36,
};
+/* Keep cmd_flag_name[] in sync with the definitions below */
enum req_flag_bits {
__REQ_FAILFAST_DEV = /* no driver retries of device errors */
REQ_OP_BITS,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b8196e219ac2..e85ec73a07d5 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -588,27 +588,28 @@ struct request_queue {
};
/* Keep blk_queue_flag_name[] in sync with the definitions below */
-#define QUEUE_FLAG_STOPPED 0 /* queue is stopped */
-#define QUEUE_FLAG_DYING 1 /* queue being torn down */
-#define QUEUE_FLAG_NOMERGES 3 /* disable merge attempts */
-#define QUEUE_FLAG_SAME_COMP 4 /* complete on same CPU-group */
-#define QUEUE_FLAG_FAIL_IO 5 /* fake timeout */
-#define QUEUE_FLAG_NOXMERGES 9 /* No extended merges */
-#define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */
-#define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */
-#define QUEUE_FLAG_STATS 20 /* track IO start and completion times */
-#define QUEUE_FLAG_REGISTERED 22 /* queue has been registered to a disk */
-#define QUEUE_FLAG_QUIESCED 24 /* queue has been quiesced */
-#define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */
-#define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */
-#define QUEUE_FLAG_SQ_SCHED 30 /* single queue style io dispatch */
+enum {
+ QUEUE_FLAG_DYING, /* queue being torn down */
+ QUEUE_FLAG_NOMERGES, /* disable merge attempts */
+ QUEUE_FLAG_SAME_COMP, /* complete on same CPU-group */
+ QUEUE_FLAG_FAIL_IO, /* fake timeout */
+ QUEUE_FLAG_NOXMERGES, /* No extended merges */
+ QUEUE_FLAG_SAME_FORCE, /* force complete on same CPU */
+ QUEUE_FLAG_INIT_DONE, /* queue is initialized */
+ QUEUE_FLAG_STATS, /* track IO start and completion times */
+ QUEUE_FLAG_REGISTERED, /* queue has been registered to a disk */
+ QUEUE_FLAG_QUIESCED, /* queue has been quiesced */
+ QUEUE_FLAG_RQ_ALLOC_TIME, /* record rq->alloc_time_ns */
+ QUEUE_FLAG_HCTX_ACTIVE, /* at least one blk-mq hctx is active */
+ QUEUE_FLAG_SQ_SCHED, /* single queue style io dispatch */
+ QUEUE_FLAG_MAX
+};
#define QUEUE_FLAG_MQ_DEFAULT (1UL << QUEUE_FLAG_SAME_COMP)
void blk_queue_flag_set(unsigned int flag, struct request_queue *q);
void blk_queue_flag_clear(unsigned int flag, struct request_queue *q);
-#define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
#define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags)
#define blk_queue_init_done(q) test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags)
#define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 293af7f8a694..ae04035b6cbe 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -539,9 +539,6 @@ struct cgroup {
/* used to store eBPF programs */
struct cgroup_bpf bpf;
- /* If there is block congestion on this cgroup. */
- atomic_t congestion_count;
-
/* Used to store internal freezer state */
struct cgroup_freezer_state freezer;
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index d662cf136021..c09cdcc99471 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -36,6 +36,11 @@ struct sbitmap_word {
* @cleared: word holding cleared bits
*/
unsigned long cleared ____cacheline_aligned_in_smp;
+
+ /**
+ * @swap_lock: serializes simultaneous updates of ->word and ->cleared
+ */
+ spinlock_t swap_lock;
} ____cacheline_aligned_in_smp;
/**
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 1e453f825c05..5e2e93307f0d 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -60,12 +60,30 @@ static inline void update_alloc_hint_after_get(struct sbitmap *sb,
/*
* See if we have deferred clears that we can batch move
*/
-static inline bool sbitmap_deferred_clear(struct sbitmap_word *map)
+static inline bool sbitmap_deferred_clear(struct sbitmap_word *map,
+ unsigned int depth, unsigned int alloc_hint, bool wrap)
{
- unsigned long mask;
+ unsigned long mask, word_mask;
- if (!READ_ONCE(map->cleared))
- return false;
+ guard(spinlock_irqsave)(&map->swap_lock);
+
+ if (!map->cleared) {
+ if (depth == 0)
+ return false;
+
+ word_mask = (~0UL) >> (BITS_PER_LONG - depth);
+ /*
+ * The current behavior is to always retry after moving
+ * ->cleared to word, and we change it to retry in case
+ * of any free bits. To avoid an infinite loop, we need
+ * to take wrap & alloc_hint into account, otherwise a
+ * soft lockup may occur.
+ */
+ if (!wrap && alloc_hint)
+ word_mask &= ~((1UL << alloc_hint) - 1);
+
+ return (READ_ONCE(map->word) & word_mask) != word_mask;
+ }
/*
* First get a stable cleared mask, setting the old mask to 0.
@@ -85,6 +103,7 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
bool alloc_hint)
{
unsigned int bits_per_word;
+ int i;
if (shift < 0)
shift = sbitmap_calculate_shift(depth);
@@ -116,6 +135,9 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
return -ENOMEM;
}
+ for (i = 0; i < sb->map_nr; i++)
+ spin_lock_init(&sb->map[i].swap_lock);
+
return 0;
}
EXPORT_SYMBOL_GPL(sbitmap_init_node);
@@ -126,7 +148,7 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth)
unsigned int i;
for (i = 0; i < sb->map_nr; i++)
- sbitmap_deferred_clear(&sb->map[i]);
+ sbitmap_deferred_clear(&sb->map[i], 0, 0, 0);
sb->depth = depth;
sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word);
@@ -179,7 +201,7 @@ static int sbitmap_find_bit_in_word(struct sbitmap_word *map,
alloc_hint, wrap);
if (nr != -1)
break;
- if (!sbitmap_deferred_clear(map))
+ if (!sbitmap_deferred_clear(map, depth, alloc_hint, wrap))
break;
} while (1);
@@ -496,7 +518,7 @@ unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags,
unsigned int map_depth = __map_depth(sb, index);
unsigned long val;
- sbitmap_deferred_clear(map);
+ sbitmap_deferred_clear(map, 0, 0, 0);
val = READ_ONCE(map->word);
if (val == (1UL << (map_depth - 1)) - 1)
goto next;