From 0905053bdb5b7ba77ad0c2e5cfc4787c1db3d4f1 Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Wed, 19 Aug 2020 23:20:29 +0800
Subject: null_blk: Support shared tag bitmap

Support a shared tag bitmap, whereby request tags are unique over all
submission queues, and not just per submission queue.

As such, per device total queue depth is normally hw_queue_depth *
submit_queues, but hw_queue_depth when set. And a similar story for when
shared_tags is set, where that is the queue depth over all null blk
devices.

Signed-off-by: John Garry <john.garry@huawei.com>
Tested-by: Douglas Gilbert <dgilbert@interlog.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/null_blk_main.c | 6 ++++++
 1 file changed, 6 insertions(+)
diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c
index d74443a9c8fa..55d2ef92be47 100644
--- a/drivers/block/null_blk_main.c
+++ b/drivers/block/null_blk_main.c
@@ -164,6 +164,10 @@ static bool shared_tags;
 module_param(shared_tags, bool, 0444);
 MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq");
 
+static bool g_shared_tag_bitmap;
+module_param_named(shared_tag_bitmap, g_shared_tag_bitmap, bool, 0444);
+MODULE_PARM_DESC(shared_tag_bitmap, "Use shared tag bitmap for all submission queues for blk-mq");
+
 static int g_irqmode = NULL_IRQ_SOFTIRQ;
 
 static int null_set_irqmode(const char *str, const struct kernel_param *kp)
@@ -1692,6 +1696,8 @@ static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set)
 	set->flags = BLK_MQ_F_SHOULD_MERGE;
 	if (g_no_sched)
 		set->flags |= BLK_MQ_F_NO_SCHED;
+	if (g_shared_tag_bitmap)
+		set->flags |= BLK_MQ_F_TAG_HCTX_SHARED;
 	set->driver_data = NULL;
 
 	if ((nullb && nullb->dev->blocking) || g_blocking)
-- 
cgit v1.2.3


From 313b825fa218bef78419d7b4092c1f4af222335d Mon Sep 17 00:00:00 2001
From: Xianting Tian <tian.xianting@h3c.com>
Date: Tue, 18 Aug 2020 13:42:06 +0800
Subject: md: only calculate blocksize once and use i_blocksize()

We alreday has the interface i_blocksize(), which can be used
to get blocksize, so use it.
Only calculate blocksize once and use it within read_page().

Signed-off-by: Xianting Tian <tian.xianting@h3c.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/md-bitmap.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index b10c51988c8e..55b757a223a4 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -357,11 +357,12 @@ static int read_page(struct file *file, unsigned long index,
 	struct inode *inode = file_inode(file);
 	struct buffer_head *bh;
 	sector_t block, blk_cur;
+	unsigned long blocksize = i_blocksize(inode);
 
 	pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE,
 		 (unsigned long long)index << PAGE_SHIFT);
 
-	bh = alloc_page_buffers(page, 1<<inode->i_blkbits, false);
+	bh = alloc_page_buffers(page, blocksize, false);
 	if (!bh) {
 		ret = -ENOMEM;
 		goto out;
@@ -383,10 +384,10 @@ static int read_page(struct file *file, unsigned long index,
 
 			bh->b_blocknr = block;
 			bh->b_bdev = inode->i_sb->s_bdev;
-			if (count < (1<<inode->i_blkbits))
+			if (count < blocksize)
 				count = 0;
 			else
-				count -= (1<<inode->i_blkbits);
+				count -= blocksize;
 
 			bh->b_end_io = end_bitmap_write;
 			bh->b_private = bitmap;
-- 
cgit v1.2.3


From 7aba13b7152979450270da5602f16f6f28fd35b8 Mon Sep 17 00:00:00 2001
From: Yufen Yu <yuyufen@huawei.com>
Date: Thu, 20 Aug 2020 09:22:06 -0400
Subject: md/raid5: add a new member of offset into r5dev

Add a new member of offset into struct r5dev. It indicates the
offset of related dev[i].page. For now, since each device have a
privated page, the value is always 0. Thus, we set offset as 0
when allcate page in grow_buffers() and resize_stripes().

To support following different page offset, we try to use the page
offset rather than '0' directly for async_memcpy() and ops_run_io().

We try to support different page offset for xor compution functions
in the following. To avoid repeatly allocate a new array each time,
we add a memory region into scribble buffer to record offset.

No functional change.

Signed-off-by: Yufen Yu <yuyufen@huawei.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5.c | 50 +++++++++++++++++++++++++++++++++++++++-----------
 drivers/md/raid5.h |  1 +
 2 files changed, 40 insertions(+), 11 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index d589d26c86ea..b64f56c87339 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -477,6 +477,7 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
 		}
 		sh->dev[i].page = page;
 		sh->dev[i].orig_page = page;
+		sh->dev[i].offset = 0;
 	}
 
 	return 0;
@@ -1130,7 +1131,7 @@ again:
 				sh->dev[i].vec.bv_page = sh->dev[i].page;
 			bi->bi_vcnt = 1;
 			bi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
-			bi->bi_io_vec[0].bv_offset = 0;
+			bi->bi_io_vec[0].bv_offset = sh->dev[i].offset;
 			bi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
 			bi->bi_write_hint = sh->dev[i].write_hint;
 			if (!rrdev)
@@ -1184,7 +1185,7 @@ again:
 			sh->dev[i].rvec.bv_page = sh->dev[i].page;
 			rbi->bi_vcnt = 1;
 			rbi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
-			rbi->bi_io_vec[0].bv_offset = 0;
+			rbi->bi_io_vec[0].bv_offset = sh->dev[i].offset;
 			rbi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
 			rbi->bi_write_hint = sh->dev[i].write_hint;
 			sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
@@ -1404,14 +1405,25 @@ static addr_conv_t *to_addr_conv(struct stripe_head *sh,
 	return (void *) (to_addr_page(percpu, i) + sh->disks + 2);
 }
 
+/*
+ * Return a pointer to record offset address.
+ */
+static unsigned int *
+to_addr_offs(struct stripe_head *sh, struct raid5_percpu *percpu)
+{
+	return (unsigned int *) (to_addr_conv(sh, percpu, 0) + sh->disks + 2);
+}
+
 static struct dma_async_tx_descriptor *
 ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
 {
 	int disks = sh->disks;
 	struct page **xor_srcs = to_addr_page(percpu, 0);
+	unsigned int *off_srcs = to_addr_offs(sh, percpu);
 	int target = sh->ops.target;
 	struct r5dev *tgt = &sh->dev[target];
 	struct page *xor_dest = tgt->page;
+	unsigned int off_dest = tgt->offset;
 	int count = 0;
 	struct dma_async_tx_descriptor *tx;
 	struct async_submit_ctl submit;
@@ -1423,16 +1435,19 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
 		__func__, (unsigned long long)sh->sector, target);
 	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
 
-	for (i = disks; i--; )
-		if (i != target)
+	for (i = disks; i--; ) {
+		if (i != target) {
+			off_srcs[count] = sh->dev[i].offset;
 			xor_srcs[count++] = sh->dev[i].page;
+		}
+	}
 
 	atomic_inc(&sh->count);
 
 	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
 			  ops_complete_compute, sh, to_addr_conv(sh, percpu, 0));
 	if (unlikely(count == 1))
-		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0,
+		tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0],
 				RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
 	else
 		tx = async_xor(xor_dest, xor_srcs, 0, count,
@@ -1854,9 +1869,11 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
 {
 	int disks = sh->disks;
 	struct page **xor_srcs;
+	unsigned int *off_srcs;
 	struct async_submit_ctl submit;
 	int count, pd_idx = sh->pd_idx, i;
 	struct page *xor_dest;
+	unsigned int off_dest;
 	int prexor = 0;
 	unsigned long flags;
 	int j = 0;
@@ -1881,24 +1898,31 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
 again:
 	count = 0;
 	xor_srcs = to_addr_page(percpu, j);
+	off_srcs = to_addr_offs(sh, percpu);
 	/* check if prexor is active which means only process blocks
 	 * that are part of a read-modify-write (written)
 	 */
 	if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
 		prexor = 1;
+		off_dest = off_srcs[count] = sh->dev[pd_idx].offset;
 		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
 			if (head_sh->dev[i].written ||
-			    test_bit(R5_InJournal, &head_sh->dev[i].flags))
+			    test_bit(R5_InJournal, &head_sh->dev[i].flags)) {
+				off_srcs[count] = dev->offset;
 				xor_srcs[count++] = dev->page;
+			}
 		}
 	} else {
 		xor_dest = sh->dev[pd_idx].page;
+		off_dest = sh->dev[pd_idx].offset;
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
-			if (i != pd_idx)
+			if (i != pd_idx) {
+				off_srcs[count] = dev->offset;
 				xor_srcs[count++] = dev->page;
+			}
 		}
 	}
 
@@ -1924,7 +1948,7 @@ again:
 	}
 
 	if (unlikely(count == 1))
-		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0,
+		tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0],
 				RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
 	else
 		tx = async_xor(xor_dest, xor_srcs, 0, count,
@@ -2253,8 +2277,9 @@ static int scribble_alloc(struct raid5_percpu *percpu,
 			  int num, int cnt)
 {
 	size_t obj_size =
-		sizeof(struct page *) * (num+2) +
-		sizeof(addr_conv_t) * (num+2);
+		sizeof(struct page *) * (num + 2) +
+		sizeof(addr_conv_t) * (num + 2) +
+		sizeof(unsigned int) * (num + 2);
 	void *scribble;
 
 	/*
@@ -2389,6 +2414,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 		for(i=0; i<conf->pool_size; i++) {
 			nsh->dev[i].page = osh->dev[i].page;
 			nsh->dev[i].orig_page = osh->dev[i].page;
+			nsh->dev[i].offset = osh->dev[i].offset;
 		}
 		nsh->hash_lock_index = hash;
 		free_stripe(conf->slab_cache, osh);
@@ -2444,6 +2470,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 				struct page *p = alloc_page(GFP_NOIO);
 				nsh->dev[i].page = p;
 				nsh->dev[i].orig_page = p;
+				nsh->dev[i].offset = 0;
 				if (!p)
 					err = -ENOMEM;
 			}
@@ -4369,7 +4396,8 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
 			/* place all the copies on one channel */
 			init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
 			tx = async_memcpy(sh2->dev[dd_idx].page,
-					  sh->dev[i].page, 0, 0, RAID5_STRIPE_SIZE(conf),
+					  sh->dev[i].page, sh2->dev[dd_idx].offset,
+					  sh->dev[i].offset, RAID5_STRIPE_SIZE(conf),
 					  &submit);
 
 			set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 16fc29472f5c..6afdd3a78580 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -253,6 +253,7 @@ struct stripe_head {
 		struct bio	req, rreq;
 		struct bio_vec	vec, rvec;
 		struct page	*page, *orig_page;
+		unsigned int    offset;     /* offset of the page */
 		struct bio	*toread, *read, *towrite, *written;
 		sector_t	sector;			/* sector of this page */
 		unsigned long	flags;
-- 
cgit v1.2.3


From 248728dd04f5804ed1ff13bb4db9795899d25dca Mon Sep 17 00:00:00 2001
From: Yufen Yu <yuyufen@huawei.com>
Date: Thu, 20 Aug 2020 09:22:07 -0400
Subject: md/raid5: make async_copy_data() to support different page offset

ops_run_biofill() and ops_run_biodrain() will call async_copy_data()
to copy sh->dev[i].page from or to bio page. For now, it implies the
offset of dev[i].page is 0. But we want to support different page offset
in the following.

Thus, pass page offset to these functions and replace 'page_offset'
with 'page_offset + poff'.

No functional change.

Signed-off-by: Yufen Yu <yuyufen@huawei.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b64f56c87339..7988d373f18b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1227,7 +1227,7 @@ again:
 
 static struct dma_async_tx_descriptor *
 async_copy_data(int frombio, struct bio *bio, struct page **page,
-	sector_t sector, struct dma_async_tx_descriptor *tx,
+	unsigned int poff, sector_t sector, struct dma_async_tx_descriptor *tx,
 	struct stripe_head *sh, int no_skipcopy)
 {
 	struct bio_vec bvl;
@@ -1273,11 +1273,11 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
 				    !no_skipcopy)
 					*page = bio_page;
 				else
-					tx = async_memcpy(*page, bio_page, page_offset,
+					tx = async_memcpy(*page, bio_page, page_offset + poff,
 						  b_offset, clen, &submit);
 			} else
 				tx = async_memcpy(bio_page, *page, b_offset,
-						  page_offset, clen, &submit);
+						  page_offset + poff, clen, &submit);
 		}
 		/* chain the operations */
 		submit.depend_tx = tx;
@@ -1350,6 +1350,7 @@ static void ops_run_biofill(struct stripe_head *sh)
 			while (rbi && rbi->bi_iter.bi_sector <
 				dev->sector + RAID5_STRIPE_SECTORS(conf)) {
 				tx = async_copy_data(0, rbi, &dev->page,
+						     dev->offset,
 						     dev->sector, tx, sh, 0);
 				rbi = r5_next_bio(conf, rbi, dev->sector);
 			}
@@ -1790,6 +1791,7 @@ again:
 					set_bit(R5_Discard, &dev->flags);
 				else {
 					tx = async_copy_data(1, wbi, &dev->page,
+							     dev->offset,
 							     dev->sector, tx, sh,
 							     r5c_is_writeback(conf->log));
 					if (dev->page != dev->orig_page &&
-- 
cgit v1.2.3


From 29bcff787a2593b2126cfaff612c0b4e560022e9 Mon Sep 17 00:00:00 2001
From: Yufen Yu <yuyufen@huawei.com>
Date: Thu, 20 Aug 2020 09:22:08 -0400
Subject: md/raid5: add new xor function to support different page offset

raid5 will call async_xor() and async_xor_val() to compute xor.
For now, both of them require the common src/dst page offset. But,
we want them to support different src/dst page offset for following
shared page.

Here, adding two new function async_xor_offs() and async_xor_val_offs()
respectively for async_xor() and async_xor_val().

Signed-off-by: Yufen Yu <yuyufen@huawei.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 crypto/async_tx/async_xor.c | 120 +++++++++++++++++++++++++++++++++++++-------
 include/linux/async_tx.h    |  11 ++++
 2 files changed, 114 insertions(+), 17 deletions(-)

diff --git a/crypto/async_tx/async_xor.c b/crypto/async_tx/async_xor.c
index 4e5eebe52e6a..a057ecb1288d 100644
--- a/crypto/async_tx/async_xor.c
+++ b/crypto/async_tx/async_xor.c
@@ -97,7 +97,8 @@ do_async_xor(struct dma_chan *chan, struct dmaengine_unmap_data *unmap,
 }
 
 static void
-do_sync_xor(struct page *dest, struct page **src_list, unsigned int offset,
+do_sync_xor_offs(struct page *dest, unsigned int offset,
+		struct page **src_list, unsigned int *src_offs,
 	    int src_cnt, size_t len, struct async_submit_ctl *submit)
 {
 	int i;
@@ -114,7 +115,8 @@ do_sync_xor(struct page *dest, struct page **src_list, unsigned int offset,
 	/* convert to buffer pointers */
 	for (i = 0; i < src_cnt; i++)
 		if (src_list[i])
-			srcs[xor_src_cnt++] = page_address(src_list[i]) + offset;
+			srcs[xor_src_cnt++] = page_address(src_list[i]) +
+				(src_offs ? src_offs[i] : offset);
 	src_cnt = xor_src_cnt;
 	/* set destination address */
 	dest_buf = page_address(dest) + offset;
@@ -135,11 +137,31 @@ do_sync_xor(struct page *dest, struct page **src_list, unsigned int offset,
 	async_tx_sync_epilog(submit);
 }
 
+static inline bool
+dma_xor_aligned_offsets(struct dma_device *device, unsigned int offset,
+		unsigned int *src_offs, int src_cnt, int len)
+{
+	int i;
+
+	if (!is_dma_xor_aligned(device, offset, 0, len))
+		return false;
+
+	if (!src_offs)
+		return true;
+
+	for (i = 0; i < src_cnt; i++) {
+		if (!is_dma_xor_aligned(device, src_offs[i], 0, len))
+			return false;
+	}
+	return true;
+}
+
 /**
- * async_xor - attempt to xor a set of blocks with a dma engine.
+ * async_xor_offs - attempt to xor a set of blocks with a dma engine.
  * @dest: destination page
+ * @offset: dst offset to start transaction
  * @src_list: array of source pages
- * @offset: common src/dst offset to start transaction
+ * @src_offs: array of source pages offset, NULL means common src/dst offset
  * @src_cnt: number of source pages
  * @len: length in bytes
  * @submit: submission / completion modifiers
@@ -157,8 +179,9 @@ do_sync_xor(struct page *dest, struct page **src_list, unsigned int offset,
  * is not specified.
  */
 struct dma_async_tx_descriptor *
-async_xor(struct page *dest, struct page **src_list, unsigned int offset,
-	  int src_cnt, size_t len, struct async_submit_ctl *submit)
+async_xor_offs(struct page *dest, unsigned int offset,
+		struct page **src_list, unsigned int *src_offs,
+		int src_cnt, size_t len, struct async_submit_ctl *submit)
 {
 	struct dma_chan *chan = async_tx_find_channel(submit, DMA_XOR,
 						      &dest, 1, src_list,
@@ -171,7 +194,8 @@ async_xor(struct page *dest, struct page **src_list, unsigned int offset,
 	if (device)
 		unmap = dmaengine_get_unmap_data(device->dev, src_cnt+1, GFP_NOWAIT);
 
-	if (unmap && is_dma_xor_aligned(device, offset, 0, len)) {
+	if (unmap && dma_xor_aligned_offsets(device, offset,
+				src_offs, src_cnt, len)) {
 		struct dma_async_tx_descriptor *tx;
 		int i, j;
 
@@ -184,7 +208,8 @@ async_xor(struct page *dest, struct page **src_list, unsigned int offset,
 				continue;
 			unmap->to_cnt++;
 			unmap->addr[j++] = dma_map_page(device->dev, src_list[i],
-							offset, len, DMA_TO_DEVICE);
+					src_offs ? src_offs[i] : offset,
+					len, DMA_TO_DEVICE);
 		}
 
 		/* map it bidirectional as it may be re-used as a source */
@@ -213,11 +238,42 @@ async_xor(struct page *dest, struct page **src_list, unsigned int offset,
 		/* wait for any prerequisite operations */
 		async_tx_quiesce(&submit->depend_tx);
 
-		do_sync_xor(dest, src_list, offset, src_cnt, len, submit);
+		do_sync_xor_offs(dest, offset, src_list, src_offs,
+				src_cnt, len, submit);
 
 		return NULL;
 	}
 }
+EXPORT_SYMBOL_GPL(async_xor_offs);
+
+/**
+ * async_xor - attempt to xor a set of blocks with a dma engine.
+ * @dest: destination page
+ * @src_list: array of source pages
+ * @offset: common src/dst offset to start transaction
+ * @src_cnt: number of source pages
+ * @len: length in bytes
+ * @submit: submission / completion modifiers
+ *
+ * honored flags: ASYNC_TX_ACK, ASYNC_TX_XOR_ZERO_DST, ASYNC_TX_XOR_DROP_DST
+ *
+ * xor_blocks always uses the dest as a source so the
+ * ASYNC_TX_XOR_ZERO_DST flag must be set to not include dest data in
+ * the calculation.  The assumption with dma eninges is that they only
+ * use the destination buffer as a source when it is explicity specified
+ * in the source list.
+ *
+ * src_list note: if the dest is also a source it must be at index zero.
+ * The contents of this array will be overwritten if a scribble region
+ * is not specified.
+ */
+struct dma_async_tx_descriptor *
+async_xor(struct page *dest, struct page **src_list, unsigned int offset,
+	  int src_cnt, size_t len, struct async_submit_ctl *submit)
+{
+	return async_xor_offs(dest, offset, src_list, NULL,
+			src_cnt, len, submit);
+}
 EXPORT_SYMBOL_GPL(async_xor);
 
 static int page_is_zero(struct page *p, unsigned int offset, size_t len)
@@ -237,10 +293,11 @@ xor_val_chan(struct async_submit_ctl *submit, struct page *dest,
 }
 
 /**
- * async_xor_val - attempt a xor parity check with a dma engine.
+ * async_xor_val_offs - attempt a xor parity check with a dma engine.
  * @dest: destination page used if the xor is performed synchronously
+ * @offset: des offset in pages to start transaction
  * @src_list: array of source pages
- * @offset: offset in pages to start transaction
+ * @src_offs: array of source pages offset, NULL means common src/det offset
  * @src_cnt: number of source pages
  * @len: length in bytes
  * @result: 0 if sum == 0 else non-zero
@@ -253,9 +310,10 @@ xor_val_chan(struct async_submit_ctl *submit, struct page *dest,
  * is not specified.
  */
 struct dma_async_tx_descriptor *
-async_xor_val(struct page *dest, struct page **src_list, unsigned int offset,
-	      int src_cnt, size_t len, enum sum_check_flags *result,
-	      struct async_submit_ctl *submit)
+async_xor_val_offs(struct page *dest, unsigned int offset,
+		struct page **src_list, unsigned int *src_offs,
+		int src_cnt, size_t len, enum sum_check_flags *result,
+		struct async_submit_ctl *submit)
 {
 	struct dma_chan *chan = xor_val_chan(submit, dest, src_list, src_cnt, len);
 	struct dma_device *device = chan ? chan->device : NULL;
@@ -268,7 +326,7 @@ async_xor_val(struct page *dest, struct page **src_list, unsigned int offset,
 		unmap = dmaengine_get_unmap_data(device->dev, src_cnt, GFP_NOWAIT);
 
 	if (unmap && src_cnt <= device->max_xor &&
-	    is_dma_xor_aligned(device, offset, 0, len)) {
+	    dma_xor_aligned_offsets(device, offset, src_offs, src_cnt, len)) {
 		unsigned long dma_prep_flags = 0;
 		int i;
 
@@ -281,7 +339,8 @@ async_xor_val(struct page *dest, struct page **src_list, unsigned int offset,
 
 		for (i = 0; i < src_cnt; i++) {
 			unmap->addr[i] = dma_map_page(device->dev, src_list[i],
-						      offset, len, DMA_TO_DEVICE);
+					src_offs ? src_offs[i] : offset,
+					len, DMA_TO_DEVICE);
 			unmap->to_cnt++;
 		}
 		unmap->len = len;
@@ -312,7 +371,8 @@ async_xor_val(struct page *dest, struct page **src_list, unsigned int offset,
 		submit->flags |= ASYNC_TX_XOR_DROP_DST;
 		submit->flags &= ~ASYNC_TX_ACK;
 
-		tx = async_xor(dest, src_list, offset, src_cnt, len, submit);
+		tx = async_xor_offs(dest, offset, src_list, src_offs,
+				src_cnt, len, submit);
 
 		async_tx_quiesce(&tx);
 
@@ -325,6 +385,32 @@ async_xor_val(struct page *dest, struct page **src_list, unsigned int offset,
 
 	return tx;
 }
+EXPORT_SYMBOL_GPL(async_xor_val_offs);
+
+/**
+ * async_xor_val - attempt a xor parity check with a dma engine.
+ * @dest: destination page used if the xor is performed synchronously
+ * @src_list: array of source pages
+ * @offset: offset in pages to start transaction
+ * @src_cnt: number of source pages
+ * @len: length in bytes
+ * @result: 0 if sum == 0 else non-zero
+ * @submit: submission / completion modifiers
+ *
+ * honored flags: ASYNC_TX_ACK
+ *
+ * src_list note: if the dest is also a source it must be at index zero.
+ * The contents of this array will be overwritten if a scribble region
+ * is not specified.
+ */
+struct dma_async_tx_descriptor *
+async_xor_val(struct page *dest, struct page **src_list, unsigned int offset,
+	      int src_cnt, size_t len, enum sum_check_flags *result,
+	      struct async_submit_ctl *submit)
+{
+	return async_xor_val_offs(dest, offset, src_list, NULL, src_cnt,
+			len, result, submit);
+}
 EXPORT_SYMBOL_GPL(async_xor_val);
 
 MODULE_AUTHOR("Intel Corporation");
diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h
index 4c328fef403c..8901f3c801ee 100644
--- a/include/linux/async_tx.h
+++ b/include/linux/async_tx.h
@@ -162,11 +162,22 @@ struct dma_async_tx_descriptor *
 async_xor(struct page *dest, struct page **src_list, unsigned int offset,
 	  int src_cnt, size_t len, struct async_submit_ctl *submit);
 
+struct dma_async_tx_descriptor *
+async_xor_offs(struct page *dest, unsigned int offset,
+		struct page **src_list, unsigned int *src_offset,
+		int src_cnt, size_t len, struct async_submit_ctl *submit);
+
 struct dma_async_tx_descriptor *
 async_xor_val(struct page *dest, struct page **src_list, unsigned int offset,
 	      int src_cnt, size_t len, enum sum_check_flags *result,
 	      struct async_submit_ctl *submit);
 
+struct dma_async_tx_descriptor *
+async_xor_val_offs(struct page *dest, unsigned int offset,
+		struct page **src_list, unsigned int *src_offset,
+		int src_cnt, size_t len, enum sum_check_flags *result,
+		struct async_submit_ctl *submit);
+
 struct dma_async_tx_descriptor *
 async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset,
 	     unsigned int src_offset, size_t len,
-- 
cgit v1.2.3


From a7c224a820c36192888ad0beb11c2e612264f533 Mon Sep 17 00:00:00 2001
From: Yufen Yu <yuyufen@huawei.com>
Date: Thu, 20 Aug 2020 09:22:09 -0400
Subject: md/raid5: convert to new xor compution interface

We try to replace async_xor() and async_xor_val() with the new
introduced interface async_xor_offs() and async_xor_val_offs()
for raid456.

Signed-off-by: Yufen Yu <yuyufen@huawei.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5.c | 43 ++++++++++++++++++++++++++++++++++---------
 1 file changed, 34 insertions(+), 9 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7988d373f18b..f34ddadd4e29 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1451,7 +1451,7 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
 		tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0],
 				RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
 	else
-		tx = async_xor(xor_dest, xor_srcs, 0, count,
+		tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
 				RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
 
 	return tx;
@@ -1509,12 +1509,14 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
 {
 	int disks = sh->disks;
 	struct page **blocks = to_addr_page(percpu, 0);
+	unsigned int *offs = to_addr_offs(sh, percpu);
 	int target;
 	int qd_idx = sh->qd_idx;
 	struct dma_async_tx_descriptor *tx;
 	struct async_submit_ctl submit;
 	struct r5dev *tgt;
 	struct page *dest;
+	unsigned int dest_off;
 	int i;
 	int count;
 
@@ -1533,6 +1535,7 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
 	tgt = &sh->dev[target];
 	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
 	dest = tgt->page;
+	dest_off = tgt->offset;
 
 	atomic_inc(&sh->count);
 
@@ -1551,13 +1554,14 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
 		for (i = disks; i-- ; ) {
 			if (i == target || i == qd_idx)
 				continue;
+			offs[count] = sh->dev[i].offset;
 			blocks[count++] = sh->dev[i].page;
 		}
 
 		init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
 				  NULL, ops_complete_compute, sh,
 				  to_addr_conv(sh, percpu, 0));
-		tx = async_xor(dest, blocks, 0, count,
+		tx = async_xor_offs(dest, dest_off, blocks, offs, count,
 				RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
 	}
 
@@ -1577,6 +1581,7 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
 	struct r5dev *tgt2 = &sh->dev[target2];
 	struct dma_async_tx_descriptor *tx;
 	struct page **blocks = to_addr_page(percpu, 0);
+	unsigned int *offs = to_addr_offs(sh, percpu);
 	struct async_submit_ctl submit;
 
 	BUG_ON(sh->batch_head);
@@ -1589,13 +1594,16 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
 	/* we need to open-code set_syndrome_sources to handle the
 	 * slot number conversion for 'faila' and 'failb'
 	 */
-	for (i = 0; i < disks ; i++)
+	for (i = 0; i < disks ; i++) {
+		offs[i] = 0;
 		blocks[i] = NULL;
+	}
 	count = 0;
 	i = d0_idx;
 	do {
 		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
 
+		offs[slot] = sh->dev[i].offset;
 		blocks[slot] = sh->dev[i].page;
 
 		if (i == target)
@@ -1625,6 +1633,7 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
 						  &submit);
 		} else {
 			struct page *dest;
+			unsigned int dest_off;
 			int data_target;
 			int qd_idx = sh->qd_idx;
 
@@ -1638,14 +1647,16 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
 			for (i = disks; i-- ; ) {
 				if (i == data_target || i == qd_idx)
 					continue;
+				offs[count] = sh->dev[i].offset;
 				blocks[count++] = sh->dev[i].page;
 			}
 			dest = sh->dev[data_target].page;
+			dest_off = sh->dev[data_target].offset;
 			init_async_submit(&submit,
 					  ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
 					  NULL, NULL, NULL,
 					  to_addr_conv(sh, percpu, 0));
-			tx = async_xor(dest, blocks, 0, count,
+			tx = async_xor_offs(dest, dest_off, blocks, offs, count,
 				       RAID5_STRIPE_SIZE(sh->raid_conf),
 				       &submit);
 
@@ -1698,10 +1709,12 @@ ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
 {
 	int disks = sh->disks;
 	struct page **xor_srcs = to_addr_page(percpu, 0);
+	unsigned int *off_srcs = to_addr_offs(sh, percpu);
 	int count = 0, pd_idx = sh->pd_idx, i;
 	struct async_submit_ctl submit;
 
 	/* existing parity data subtracted */
+	unsigned int off_dest = off_srcs[count] = sh->dev[pd_idx].offset;
 	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
 
 	BUG_ON(sh->batch_head);
@@ -1711,15 +1724,22 @@ ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
 	for (i = disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
 		/* Only process blocks that are known to be uptodate */
-		if (test_bit(R5_InJournal, &dev->flags))
+		if (test_bit(R5_InJournal, &dev->flags)) {
+			/*
+			 * For this case, PAGE_SIZE must be equal to 4KB and
+			 * page offset is zero.
+			 */
+			off_srcs[count] = dev->offset;
 			xor_srcs[count++] = dev->orig_page;
-		else if (test_bit(R5_Wantdrain, &dev->flags))
+		} else if (test_bit(R5_Wantdrain, &dev->flags)) {
+			off_srcs[count] = dev->offset;
 			xor_srcs[count++] = dev->page;
+		}
 	}
 
 	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
 			  ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
-	tx = async_xor(xor_dest, xor_srcs, 0, count,
+	tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
 			RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
 
 	return tx;
@@ -1953,7 +1973,7 @@ again:
 		tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0],
 				RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
 	else
-		tx = async_xor(xor_dest, xor_srcs, 0, count,
+		tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
 				RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
 	if (!last_stripe) {
 		j++;
@@ -2042,7 +2062,9 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
 	int pd_idx = sh->pd_idx;
 	int qd_idx = sh->qd_idx;
 	struct page *xor_dest;
+	unsigned int off_dest;
 	struct page **xor_srcs = to_addr_page(percpu, 0);
+	unsigned int *off_srcs = to_addr_offs(sh, percpu);
 	struct dma_async_tx_descriptor *tx;
 	struct async_submit_ctl submit;
 	int count;
@@ -2054,16 +2076,19 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
 	BUG_ON(sh->batch_head);
 	count = 0;
 	xor_dest = sh->dev[pd_idx].page;
+	off_dest = sh->dev[pd_idx].offset;
+	off_srcs[count] = off_dest;
 	xor_srcs[count++] = xor_dest;
 	for (i = disks; i--; ) {
 		if (i == pd_idx || i == qd_idx)
 			continue;
+		off_srcs[count] = sh->dev[i].offset;
 		xor_srcs[count++] = sh->dev[i].page;
 	}
 
 	init_async_submit(&submit, 0, NULL, NULL, NULL,
 			  to_addr_conv(sh, percpu, 0));
-	tx = async_xor_val(xor_dest, xor_srcs, 0, count,
+	tx = async_xor_val_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
 			   RAID5_STRIPE_SIZE(sh->raid_conf),
 			   &sh->ops.zero_sum_result, &submit);
 
-- 
cgit v1.2.3


From d69454bc9fdfb63e84152e0533454079ea3e38a3 Mon Sep 17 00:00:00 2001
From: Yufen Yu <yuyufen@huawei.com>
Date: Thu, 20 Aug 2020 09:22:10 -0400
Subject: md/raid6: let syndrome computor support different page offset

For now, syndrome compute functions require common offset in the pages
array. However, we expect them to support different offset when try to
use shared page in the following. Simplily covert them by adding page
offset where each page address are referred.

Since the only caller of async_gen_syndrome() and async_syndrome_val()
are in raid6, we don't want to reserve the old interface but modify the
interface directly. After that, replacing old interfaces with new ones
for raid6 and raid6test.

Signed-off-by: Yufen Yu <yuyufen@huawei.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 crypto/async_tx/async_pq.c  | 72 ++++++++++++++++++++++++++++++---------------
 crypto/async_tx/raid6test.c | 24 ++++++++++-----
 drivers/md/raid5.c          | 36 +++++++++++++++--------
 include/linux/async_tx.h    |  6 ++--
 4 files changed, 91 insertions(+), 47 deletions(-)

diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c
index 341ece61cf9b..f9cdc5e91664 100644
--- a/crypto/async_tx/async_pq.c
+++ b/crypto/async_tx/async_pq.c
@@ -104,7 +104,7 @@ do_async_gen_syndrome(struct dma_chan *chan,
  * do_sync_gen_syndrome - synchronously calculate a raid6 syndrome
  */
 static void
-do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
+do_sync_gen_syndrome(struct page **blocks, unsigned int *offsets, int disks,
 		     size_t len, struct async_submit_ctl *submit)
 {
 	void **srcs;
@@ -121,7 +121,8 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
 			BUG_ON(i > disks - 3); /* P or Q can't be zero */
 			srcs[i] = (void*)raid6_empty_zero_page;
 		} else {
-			srcs[i] = page_address(blocks[i]) + offset;
+			srcs[i] = page_address(blocks[i]) + offsets[i];
+
 			if (i < disks - 2) {
 				stop = i;
 				if (start == -1)
@@ -138,10 +139,23 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
 	async_tx_sync_epilog(submit);
 }
 
+static inline bool
+is_dma_pq_aligned_offs(struct dma_device *dev, unsigned int *offs,
+				     int src_cnt, size_t len)
+{
+	int i;
+
+	for (i = 0; i < src_cnt; i++) {
+		if (!is_dma_pq_aligned(dev, offs[i], 0, len))
+			return false;
+	}
+	return true;
+}
+
 /**
  * async_gen_syndrome - asynchronously calculate a raid6 syndrome
  * @blocks: source blocks from idx 0..disks-3, P @ disks-2 and Q @ disks-1
- * @offset: common offset into each block (src and dest) to start transaction
+ * @offsets: offset array into each block (src and dest) to start transaction
  * @disks: number of blocks (including missing P or Q, see below)
  * @len: length of operation in bytes
  * @submit: submission/completion modifiers
@@ -160,7 +174,7 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
  * path.
  */
 struct dma_async_tx_descriptor *
-async_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
+async_gen_syndrome(struct page **blocks, unsigned int *offsets, int disks,
 		   size_t len, struct async_submit_ctl *submit)
 {
 	int src_cnt = disks - 2;
@@ -179,7 +193,7 @@ async_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
 	if (unmap && !(submit->flags & ASYNC_TX_PQ_XOR_DST) &&
 	    (src_cnt <= dma_maxpq(device, 0) ||
 	     dma_maxpq(device, DMA_PREP_CONTINUE) > 0) &&
-	    is_dma_pq_aligned(device, offset, 0, len)) {
+	    is_dma_pq_aligned_offs(device, offsets, disks, len)) {
 		struct dma_async_tx_descriptor *tx;
 		enum dma_ctrl_flags dma_flags = 0;
 		unsigned char coefs[MAX_DISKS];
@@ -196,8 +210,8 @@ async_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
 		for (i = 0, j = 0; i < src_cnt; i++) {
 			if (blocks[i] == NULL)
 				continue;
-			unmap->addr[j] = dma_map_page(device->dev, blocks[i], offset,
-						      len, DMA_TO_DEVICE);
+			unmap->addr[j] = dma_map_page(device->dev, blocks[i],
+						offsets[i], len, DMA_TO_DEVICE);
 			coefs[j] = raid6_gfexp[i];
 			unmap->to_cnt++;
 			j++;
@@ -210,7 +224,8 @@ async_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
 		unmap->bidi_cnt++;
 		if (P(blocks, disks))
 			unmap->addr[j++] = dma_map_page(device->dev, P(blocks, disks),
-							offset, len, DMA_BIDIRECTIONAL);
+							P(offsets, disks),
+							len, DMA_BIDIRECTIONAL);
 		else {
 			unmap->addr[j++] = 0;
 			dma_flags |= DMA_PREP_PQ_DISABLE_P;
@@ -219,7 +234,8 @@ async_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
 		unmap->bidi_cnt++;
 		if (Q(blocks, disks))
 			unmap->addr[j++] = dma_map_page(device->dev, Q(blocks, disks),
-						       offset, len, DMA_BIDIRECTIONAL);
+							Q(offsets, disks),
+							len, DMA_BIDIRECTIONAL);
 		else {
 			unmap->addr[j++] = 0;
 			dma_flags |= DMA_PREP_PQ_DISABLE_Q;
@@ -240,13 +256,13 @@ async_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
 
 	if (!P(blocks, disks)) {
 		P(blocks, disks) = pq_scribble_page;
-		BUG_ON(len + offset > PAGE_SIZE);
+		P(offsets, disks) = 0;
 	}
 	if (!Q(blocks, disks)) {
 		Q(blocks, disks) = pq_scribble_page;
-		BUG_ON(len + offset > PAGE_SIZE);
+		Q(offsets, disks) = 0;
 	}
-	do_sync_gen_syndrome(blocks, offset, disks, len, submit);
+	do_sync_gen_syndrome(blocks, offsets, disks, len, submit);
 
 	return NULL;
 }
@@ -270,6 +286,7 @@ pq_val_chan(struct async_submit_ctl *submit, struct page **blocks, int disks, si
  * @len: length of operation in bytes
  * @pqres: on val failure SUM_CHECK_P_RESULT and/or SUM_CHECK_Q_RESULT are set
  * @spare: temporary result buffer for the synchronous case
+ * @s_off: spare buffer page offset
  * @submit: submission / completion modifiers
  *
  * The same notes from async_gen_syndrome apply to the 'blocks',
@@ -278,9 +295,9 @@ pq_val_chan(struct async_submit_ctl *submit, struct page **blocks, int disks, si
  * specified.
  */
 struct dma_async_tx_descriptor *
-async_syndrome_val(struct page **blocks, unsigned int offset, int disks,
+async_syndrome_val(struct page **blocks, unsigned int *offsets, int disks,
 		   size_t len, enum sum_check_flags *pqres, struct page *spare,
-		   struct async_submit_ctl *submit)
+		   unsigned int s_off, struct async_submit_ctl *submit)
 {
 	struct dma_chan *chan = pq_val_chan(submit, blocks, disks, len);
 	struct dma_device *device = chan ? chan->device : NULL;
@@ -295,7 +312,7 @@ async_syndrome_val(struct page **blocks, unsigned int offset, int disks,
 		unmap = dmaengine_get_unmap_data(device->dev, disks, GFP_NOWAIT);
 
 	if (unmap && disks <= dma_maxpq(device, 0) &&
-	    is_dma_pq_aligned(device, offset, 0, len)) {
+	    is_dma_pq_aligned_offs(device, offsets, disks, len)) {
 		struct device *dev = device->dev;
 		dma_addr_t pq[2];
 		int i, j = 0, src_cnt = 0;
@@ -307,7 +324,7 @@ async_syndrome_val(struct page **blocks, unsigned int offset, int disks,
 		for (i = 0; i < disks-2; i++)
 			if (likely(blocks[i])) {
 				unmap->addr[j] = dma_map_page(dev, blocks[i],
-							      offset, len,
+							      offsets[i], len,
 							      DMA_TO_DEVICE);
 				coefs[j] = raid6_gfexp[i];
 				unmap->to_cnt++;
@@ -320,7 +337,7 @@ async_syndrome_val(struct page **blocks, unsigned int offset, int disks,
 			dma_flags |= DMA_PREP_PQ_DISABLE_P;
 		} else {
 			pq[0] = dma_map_page(dev, P(blocks, disks),
-					     offset, len,
+					     P(offsets, disks), len,
 					     DMA_TO_DEVICE);
 			unmap->addr[j++] = pq[0];
 			unmap->to_cnt++;
@@ -330,7 +347,7 @@ async_syndrome_val(struct page **blocks, unsigned int offset, int disks,
 			dma_flags |= DMA_PREP_PQ_DISABLE_Q;
 		} else {
 			pq[1] = dma_map_page(dev, Q(blocks, disks),
-					     offset, len,
+					     Q(offsets, disks), len,
 					     DMA_TO_DEVICE);
 			unmap->addr[j++] = pq[1];
 			unmap->to_cnt++;
@@ -355,7 +372,9 @@ async_syndrome_val(struct page **blocks, unsigned int offset, int disks,
 		async_tx_submit(chan, tx, submit);
 	} else {
 		struct page *p_src = P(blocks, disks);
+		unsigned int p_off = P(offsets, disks);
 		struct page *q_src = Q(blocks, disks);
+		unsigned int q_off = Q(offsets, disks);
 		enum async_tx_flags flags_orig = submit->flags;
 		dma_async_tx_callback cb_fn_orig = submit->cb_fn;
 		void *scribble = submit->scribble;
@@ -381,27 +400,32 @@ async_syndrome_val(struct page **blocks, unsigned int offset, int disks,
 		if (p_src) {
 			init_async_submit(submit, ASYNC_TX_XOR_ZERO_DST, NULL,
 					  NULL, NULL, scribble);
-			tx = async_xor(spare, blocks, offset, disks-2, len, submit);
+			tx = async_xor_offs(spare, s_off,
+					blocks, offsets, disks-2, len, submit);
 			async_tx_quiesce(&tx);
-			p = page_address(p_src) + offset;
-			s = page_address(spare) + offset;
+			p = page_address(p_src) + p_off;
+			s = page_address(spare) + s_off;
 			*pqres |= !!memcmp(p, s, len) << SUM_CHECK_P;
 		}
 
 		if (q_src) {
 			P(blocks, disks) = NULL;
 			Q(blocks, disks) = spare;
+			Q(offsets, disks) = s_off;
 			init_async_submit(submit, 0, NULL, NULL, NULL, scribble);
-			tx = async_gen_syndrome(blocks, offset, disks, len, submit);
+			tx = async_gen_syndrome(blocks, offsets, disks,
+					len, submit);
 			async_tx_quiesce(&tx);
-			q = page_address(q_src) + offset;
-			s = page_address(spare) + offset;
+			q = page_address(q_src) + q_off;
+			s = page_address(spare) + s_off;
 			*pqres |= !!memcmp(q, s, len) << SUM_CHECK_Q;
 		}
 
 		/* restore P, Q and submit */
 		P(blocks, disks) = p_src;
+		P(offsets, disks) = p_off;
 		Q(blocks, disks) = q_src;
+		Q(offsets, disks) = q_off;
 
 		submit->cb_fn = cb_fn_orig;
 		submit->cb_param = cb_param_orig;
diff --git a/crypto/async_tx/raid6test.c b/crypto/async_tx/raid6test.c
index 14e73dcd7475..ef4e6e68f759 100644
--- a/crypto/async_tx/raid6test.c
+++ b/crypto/async_tx/raid6test.c
@@ -18,6 +18,7 @@
 #define NDISKS 64 /* Including P and Q */
 
 static struct page *dataptrs[NDISKS];
+unsigned int dataoffs[NDISKS];
 static addr_conv_t addr_conv[NDISKS];
 static struct page *data[NDISKS+3];
 static struct page *spare;
@@ -38,6 +39,7 @@ static void makedata(int disks)
 	for (i = 0; i < disks; i++) {
 		prandom_bytes(page_address(data[i]), PAGE_SIZE);
 		dataptrs[i] = data[i];
+		dataoffs[i] = 0;
 	}
 }
 
@@ -52,7 +54,8 @@ static char disk_type(int d, int disks)
 }
 
 /* Recover two failed blocks. */
-static void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, struct page **ptrs)
+static void raid6_dual_recov(int disks, size_t bytes, int faila, int failb,
+		struct page **ptrs, unsigned int *offs)
 {
 	struct async_submit_ctl submit;
 	struct completion cmp;
@@ -66,7 +69,8 @@ static void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, stru
 		if (faila == disks-2) {
 			/* P+Q failure.  Just rebuild the syndrome. */
 			init_async_submit(&submit, 0, NULL, NULL, NULL, addr_conv);
-			tx = async_gen_syndrome(ptrs, 0, disks, bytes, &submit);
+			tx = async_gen_syndrome(ptrs, offs,
+					disks, bytes, &submit);
 		} else {
 			struct page *blocks[NDISKS];
 			struct page *dest;
@@ -89,22 +93,26 @@ static void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, stru
 			tx = async_xor(dest, blocks, 0, count, bytes, &submit);
 
 			init_async_submit(&submit, 0, tx, NULL, NULL, addr_conv);
-			tx = async_gen_syndrome(ptrs, 0, disks, bytes, &submit);
+			tx = async_gen_syndrome(ptrs, offs,
+					disks, bytes, &submit);
 		}
 	} else {
 		if (failb == disks-2) {
 			/* data+P failure. */
 			init_async_submit(&submit, 0, NULL, NULL, NULL, addr_conv);
-			tx = async_raid6_datap_recov(disks, bytes, faila, ptrs, &submit);
+			tx = async_raid6_datap_recov(disks, bytes,
+					faila, ptrs, &submit);
 		} else {
 			/* data+data failure. */
 			init_async_submit(&submit, 0, NULL, NULL, NULL, addr_conv);
-			tx = async_raid6_2data_recov(disks, bytes, faila, failb, ptrs, &submit);
+			tx = async_raid6_2data_recov(disks, bytes,
+					faila, failb, ptrs, &submit);
 		}
 	}
 	init_completion(&cmp);
 	init_async_submit(&submit, ASYNC_TX_ACK, tx, callback, &cmp, addr_conv);
-	tx = async_syndrome_val(ptrs, 0, disks, bytes, &result, spare, &submit);
+	tx = async_syndrome_val(ptrs, offs,
+			disks, bytes, &result, spare, 0, &submit);
 	async_tx_issue_pending(tx);
 
 	if (wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000)) == 0)
@@ -126,7 +134,7 @@ static int test_disks(int i, int j, int disks)
 	dataptrs[i] = recovi;
 	dataptrs[j] = recovj;
 
-	raid6_dual_recov(disks, PAGE_SIZE, i, j, dataptrs);
+	raid6_dual_recov(disks, PAGE_SIZE, i, j, dataptrs, dataoffs);
 
 	erra = memcmp(page_address(data[i]), page_address(recovi), PAGE_SIZE);
 	errb = memcmp(page_address(data[j]), page_address(recovj), PAGE_SIZE);
@@ -162,7 +170,7 @@ static int test(int disks, int *tests)
 	/* Generate assumed good syndrome */
 	init_completion(&cmp);
 	init_async_submit(&submit, ASYNC_TX_ACK, NULL, callback, &cmp, addr_conv);
-	tx = async_gen_syndrome(dataptrs, 0, disks, PAGE_SIZE, &submit);
+	tx = async_gen_syndrome(dataptrs, dataoffs, disks, PAGE_SIZE, &submit);
 	async_tx_issue_pending(tx);
 
 	if (wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000)) == 0) {
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f34ddadd4e29..35f0317f0717 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1459,6 +1459,7 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
 
 /* set_syndrome_sources - populate source buffers for gen_syndrome
  * @srcs - (struct page *) array of size sh->disks
+ * @offs - (unsigned int) array of offset for each page
  * @sh - stripe_head to parse
  *
  * Populates srcs in proper layout order for the stripe and returns the
@@ -1467,6 +1468,7 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
  * is recorded in srcs[count+1]].
  */
 static int set_syndrome_sources(struct page **srcs,
+				unsigned int *offs,
 				struct stripe_head *sh,
 				int srctype)
 {
@@ -1497,6 +1499,12 @@ static int set_syndrome_sources(struct page **srcs,
 				srcs[slot] = sh->dev[i].orig_page;
 			else
 				srcs[slot] = sh->dev[i].page;
+			/*
+			 * For R5_InJournal, PAGE_SIZE must be 4KB and will
+			 * not shared page. In that case, dev[i].offset
+			 * is 0.
+			 */
+			offs[slot] = sh->dev[i].offset;
 		}
 		i = raid6_next_disk(i, disks);
 	} while (i != d0_idx);
@@ -1540,13 +1548,13 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
 	atomic_inc(&sh->count);
 
 	if (target == qd_idx) {
-		count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
+		count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL);
 		blocks[count] = NULL; /* regenerating p is not necessary */
 		BUG_ON(blocks[count+1] != dest); /* q should already be set */
 		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
 				  ops_complete_compute, sh,
 				  to_addr_conv(sh, percpu, 0));
-		tx = async_gen_syndrome(blocks, 0, count+2,
+		tx = async_gen_syndrome(blocks, offs, count+2,
 				RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
 	} else {
 		/* Compute any data- or p-drive using XOR */
@@ -1628,7 +1636,7 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
 			init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
 					  ops_complete_compute, sh,
 					  to_addr_conv(sh, percpu, 0));
-			return async_gen_syndrome(blocks, 0, syndrome_disks+2,
+			return async_gen_syndrome(blocks, offs, syndrome_disks+2,
 						  RAID5_STRIPE_SIZE(sh->raid_conf),
 						  &submit);
 		} else {
@@ -1660,11 +1668,11 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
 				       RAID5_STRIPE_SIZE(sh->raid_conf),
 				       &submit);
 
-			count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
+			count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL);
 			init_async_submit(&submit, ASYNC_TX_FENCE, tx,
 					  ops_complete_compute, sh,
 					  to_addr_conv(sh, percpu, 0));
-			return async_gen_syndrome(blocks, 0, count+2,
+			return async_gen_syndrome(blocks, offs, count+2,
 						  RAID5_STRIPE_SIZE(sh->raid_conf),
 						  &submit);
 		}
@@ -1750,17 +1758,18 @@ ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
 		struct dma_async_tx_descriptor *tx)
 {
 	struct page **blocks = to_addr_page(percpu, 0);
+	unsigned int *offs = to_addr_offs(sh, percpu);
 	int count;
 	struct async_submit_ctl submit;
 
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
-	count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN);
+	count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_WANT_DRAIN);
 
 	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
 			  ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
-	tx = async_gen_syndrome(blocks, 0, count+2,
+	tx = async_gen_syndrome(blocks, offs, count+2,
 			RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
 
 	return tx;
@@ -1989,6 +1998,7 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
 {
 	struct async_submit_ctl submit;
 	struct page **blocks;
+	unsigned int *offs;
 	int count, i, j = 0;
 	struct stripe_head *head_sh = sh;
 	int last_stripe;
@@ -2013,6 +2023,7 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
 
 again:
 	blocks = to_addr_page(percpu, j);
+	offs = to_addr_offs(sh, percpu);
 
 	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
 		synflags = SYNDROME_SRC_WRITTEN;
@@ -2022,7 +2033,7 @@ again:
 		txflags = ASYNC_TX_ACK;
 	}
 
-	count = set_syndrome_sources(blocks, sh, synflags);
+	count = set_syndrome_sources(blocks, offs, sh, synflags);
 	last_stripe = !head_sh->batch_head ||
 		list_first_entry(&sh->batch_list,
 				 struct stripe_head, batch_list) == head_sh;
@@ -2034,7 +2045,7 @@ again:
 	} else
 		init_async_submit(&submit, 0, tx, NULL, NULL,
 				  to_addr_conv(sh, percpu, j));
-	tx = async_gen_syndrome(blocks, 0, count+2,
+	tx = async_gen_syndrome(blocks, offs, count+2,
 			RAID5_STRIPE_SIZE(sh->raid_conf),  &submit);
 	if (!last_stripe) {
 		j++;
@@ -2100,6 +2111,7 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
 {
 	struct page **srcs = to_addr_page(percpu, 0);
+	unsigned int *offs = to_addr_offs(sh, percpu);
 	struct async_submit_ctl submit;
 	int count;
 
@@ -2107,16 +2119,16 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu
 		(unsigned long long)sh->sector, checkp);
 
 	BUG_ON(sh->batch_head);
-	count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL);
+	count = set_syndrome_sources(srcs, offs, sh, SYNDROME_SRC_ALL);
 	if (!checkp)
 		srcs[count] = NULL;
 
 	atomic_inc(&sh->count);
 	init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
 			  sh, to_addr_conv(sh, percpu, 0));
-	async_syndrome_val(srcs, 0, count+2,
+	async_syndrome_val(srcs, offs, count+2,
 			   RAID5_STRIPE_SIZE(sh->raid_conf),
-			   &sh->ops.zero_sum_result, percpu->spare_page, &submit);
+			   &sh->ops.zero_sum_result, percpu->spare_page, 0, &submit);
 }
 
 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h
index 8901f3c801ee..efc5510f7d11 100644
--- a/include/linux/async_tx.h
+++ b/include/linux/async_tx.h
@@ -186,13 +186,13 @@ async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset,
 struct dma_async_tx_descriptor *async_trigger_callback(struct async_submit_ctl *submit);
 
 struct dma_async_tx_descriptor *
-async_gen_syndrome(struct page **blocks, unsigned int offset, int src_cnt,
+async_gen_syndrome(struct page **blocks, unsigned int *offsets, int src_cnt,
 		   size_t len, struct async_submit_ctl *submit);
 
 struct dma_async_tx_descriptor *
-async_syndrome_val(struct page **blocks, unsigned int offset, int src_cnt,
+async_syndrome_val(struct page **blocks, unsigned int *offsets, int src_cnt,
 		   size_t len, enum sum_check_flags *pqres, struct page *spare,
-		   struct async_submit_ctl *submit);
+		   unsigned int s_off, struct async_submit_ctl *submit);
 
 struct dma_async_tx_descriptor *
 async_raid6_2data_recov(int src_num, size_t bytes, int faila, int failb,
-- 
cgit v1.2.3


From 4f86ff5580fa692f850f37f948c73814a24a722b Mon Sep 17 00:00:00 2001
From: Yufen Yu <yuyufen@huawei.com>
Date: Thu, 20 Aug 2020 09:22:11 -0400
Subject: md/raid6: let async recovery function support different page offset

For now, asynchronous raid6 recovery calculate functions are require
common offset for pages. But, we expect them to support different page
offset after introducing stripe shared page. Do that by simplily adding
page offset where each page address are referred. Then, replace the
old interface with the new ones in raid6 and raid6test.

Signed-off-by: Yufen Yu <yuyufen@huawei.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 crypto/async_tx/async_raid6_recov.c | 163 ++++++++++++++++++++++++++----------
 crypto/async_tx/raid6test.c         |   4 +-
 drivers/md/raid5.c                  |   4 +-
 include/linux/async_tx.h            |   6 +-
 4 files changed, 128 insertions(+), 49 deletions(-)

diff --git a/crypto/async_tx/async_raid6_recov.c b/crypto/async_tx/async_raid6_recov.c
index f249142ceac4..354b8cd5537f 100644
--- a/crypto/async_tx/async_raid6_recov.c
+++ b/crypto/async_tx/async_raid6_recov.c
@@ -15,8 +15,9 @@
 #include <linux/dmaengine.h>
 
 static struct dma_async_tx_descriptor *
-async_sum_product(struct page *dest, struct page **srcs, unsigned char *coef,
-		  size_t len, struct async_submit_ctl *submit)
+async_sum_product(struct page *dest, unsigned int d_off,
+		struct page **srcs, unsigned int *src_offs, unsigned char *coef,
+		size_t len, struct async_submit_ctl *submit)
 {
 	struct dma_chan *chan = async_tx_find_channel(submit, DMA_PQ,
 						      &dest, 1, srcs, 2, len);
@@ -37,11 +38,14 @@ async_sum_product(struct page *dest, struct page **srcs, unsigned char *coef,
 
 		if (submit->flags & ASYNC_TX_FENCE)
 			dma_flags |= DMA_PREP_FENCE;
-		unmap->addr[0] = dma_map_page(dev, srcs[0], 0, len, DMA_TO_DEVICE);
-		unmap->addr[1] = dma_map_page(dev, srcs[1], 0, len, DMA_TO_DEVICE);
+		unmap->addr[0] = dma_map_page(dev, srcs[0], src_offs[0],
+						len, DMA_TO_DEVICE);
+		unmap->addr[1] = dma_map_page(dev, srcs[1], src_offs[1],
+						len, DMA_TO_DEVICE);
 		unmap->to_cnt = 2;
 
-		unmap->addr[2] = dma_map_page(dev, dest, 0, len, DMA_BIDIRECTIONAL);
+		unmap->addr[2] = dma_map_page(dev, dest, d_off,
+						len, DMA_BIDIRECTIONAL);
 		unmap->bidi_cnt = 1;
 		/* engine only looks at Q, but expects it to follow P */
 		pq[1] = unmap->addr[2];
@@ -66,9 +70,9 @@ async_sum_product(struct page *dest, struct page **srcs, unsigned char *coef,
 	async_tx_quiesce(&submit->depend_tx);
 	amul = raid6_gfmul[coef[0]];
 	bmul = raid6_gfmul[coef[1]];
-	a = page_address(srcs[0]);
-	b = page_address(srcs[1]);
-	c = page_address(dest);
+	a = page_address(srcs[0]) + src_offs[0];
+	b = page_address(srcs[1]) + src_offs[1];
+	c = page_address(dest) + d_off;
 
 	while (len--) {
 		ax    = amul[*a++];
@@ -80,8 +84,9 @@ async_sum_product(struct page *dest, struct page **srcs, unsigned char *coef,
 }
 
 static struct dma_async_tx_descriptor *
-async_mult(struct page *dest, struct page *src, u8 coef, size_t len,
-	   struct async_submit_ctl *submit)
+async_mult(struct page *dest, unsigned int d_off, struct page *src,
+		unsigned int s_off, u8 coef, size_t len,
+		struct async_submit_ctl *submit)
 {
 	struct dma_chan *chan = async_tx_find_channel(submit, DMA_PQ,
 						      &dest, 1, &src, 1, len);
@@ -101,9 +106,11 @@ async_mult(struct page *dest, struct page *src, u8 coef, size_t len,
 
 		if (submit->flags & ASYNC_TX_FENCE)
 			dma_flags |= DMA_PREP_FENCE;
-		unmap->addr[0] = dma_map_page(dev, src, 0, len, DMA_TO_DEVICE);
+		unmap->addr[0] = dma_map_page(dev, src, s_off,
+						len, DMA_TO_DEVICE);
 		unmap->to_cnt++;
-		unmap->addr[1] = dma_map_page(dev, dest, 0, len, DMA_BIDIRECTIONAL);
+		unmap->addr[1] = dma_map_page(dev, dest, d_off,
+						len, DMA_BIDIRECTIONAL);
 		dma_dest[1] = unmap->addr[1];
 		unmap->bidi_cnt++;
 		unmap->len = len;
@@ -133,8 +140,8 @@ async_mult(struct page *dest, struct page *src, u8 coef, size_t len,
 	 */
 	async_tx_quiesce(&submit->depend_tx);
 	qmul  = raid6_gfmul[coef];
-	d = page_address(dest);
-	s = page_address(src);
+	d = page_address(dest) + d_off;
+	s = page_address(src) + s_off;
 
 	while (len--)
 		*d++ = qmul[*s++];
@@ -144,11 +151,14 @@ async_mult(struct page *dest, struct page *src, u8 coef, size_t len,
 
 static struct dma_async_tx_descriptor *
 __2data_recov_4(int disks, size_t bytes, int faila, int failb,
-		struct page **blocks, struct async_submit_ctl *submit)
+		struct page **blocks, unsigned int *offs,
+		struct async_submit_ctl *submit)
 {
 	struct dma_async_tx_descriptor *tx = NULL;
 	struct page *p, *q, *a, *b;
+	unsigned int p_off, q_off, a_off, b_off;
 	struct page *srcs[2];
+	unsigned int src_offs[2];
 	unsigned char coef[2];
 	enum async_tx_flags flags = submit->flags;
 	dma_async_tx_callback cb_fn = submit->cb_fn;
@@ -156,26 +166,34 @@ __2data_recov_4(int disks, size_t bytes, int faila, int failb,
 	void *scribble = submit->scribble;
 
 	p = blocks[disks-2];
+	p_off = offs[disks-2];
 	q = blocks[disks-1];
+	q_off = offs[disks-1];
 
 	a = blocks[faila];
+	a_off = offs[faila];
 	b = blocks[failb];
+	b_off = offs[failb];
 
 	/* in the 4 disk case P + Pxy == P and Q + Qxy == Q */
 	/* Dx = A*(P+Pxy) + B*(Q+Qxy) */
 	srcs[0] = p;
+	src_offs[0] = p_off;
 	srcs[1] = q;
+	src_offs[1] = q_off;
 	coef[0] = raid6_gfexi[failb-faila];
 	coef[1] = raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]];
 	init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
-	tx = async_sum_product(b, srcs, coef, bytes, submit);
+	tx = async_sum_product(b, b_off, srcs, src_offs, coef, bytes, submit);
 
 	/* Dy = P+Pxy+Dx */
 	srcs[0] = p;
+	src_offs[0] = p_off;
 	srcs[1] = b;
+	src_offs[1] = b_off;
 	init_async_submit(submit, flags | ASYNC_TX_XOR_ZERO_DST, tx, cb_fn,
 			  cb_param, scribble);
-	tx = async_xor(a, srcs, 0, 2, bytes, submit);
+	tx = async_xor_offs(a, a_off, srcs, src_offs, 2, bytes, submit);
 
 	return tx;
 
@@ -183,11 +201,14 @@ __2data_recov_4(int disks, size_t bytes, int faila, int failb,
 
 static struct dma_async_tx_descriptor *
 __2data_recov_5(int disks, size_t bytes, int faila, int failb,
-		struct page **blocks, struct async_submit_ctl *submit)
+		struct page **blocks, unsigned int *offs,
+		struct async_submit_ctl *submit)
 {
 	struct dma_async_tx_descriptor *tx = NULL;
 	struct page *p, *q, *g, *dp, *dq;
+	unsigned int p_off, q_off, g_off, dp_off, dq_off;
 	struct page *srcs[2];
+	unsigned int src_offs[2];
 	unsigned char coef[2];
 	enum async_tx_flags flags = submit->flags;
 	dma_async_tx_callback cb_fn = submit->cb_fn;
@@ -208,60 +229,77 @@ __2data_recov_5(int disks, size_t bytes, int faila, int failb,
 	BUG_ON(good_srcs > 1);
 
 	p = blocks[disks-2];
+	p_off = offs[disks-2];
 	q = blocks[disks-1];
+	q_off = offs[disks-1];
 	g = blocks[good];
+	g_off = offs[good];
 
 	/* Compute syndrome with zero for the missing data pages
 	 * Use the dead data pages as temporary storage for delta p and
 	 * delta q
 	 */
 	dp = blocks[faila];
+	dp_off = offs[faila];
 	dq = blocks[failb];
+	dq_off = offs[failb];
 
 	init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
-	tx = async_memcpy(dp, g, 0, 0, bytes, submit);
+	tx = async_memcpy(dp, g, dp_off, g_off, bytes, submit);
 	init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
-	tx = async_mult(dq, g, raid6_gfexp[good], bytes, submit);
+	tx = async_mult(dq, dq_off, g, g_off,
+			raid6_gfexp[good], bytes, submit);
 
 	/* compute P + Pxy */
 	srcs[0] = dp;
+	src_offs[0] = dp_off;
 	srcs[1] = p;
+	src_offs[1] = p_off;
 	init_async_submit(submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
 			  NULL, NULL, scribble);
-	tx = async_xor(dp, srcs, 0, 2, bytes, submit);
+	tx = async_xor_offs(dp, dp_off, srcs, src_offs, 2, bytes, submit);
 
 	/* compute Q + Qxy */
 	srcs[0] = dq;
+	src_offs[0] = dq_off;
 	srcs[1] = q;
+	src_offs[1] = q_off;
 	init_async_submit(submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
 			  NULL, NULL, scribble);
-	tx = async_xor(dq, srcs, 0, 2, bytes, submit);
+	tx = async_xor_offs(dq, dq_off, srcs, src_offs, 2, bytes, submit);
 
 	/* Dx = A*(P+Pxy) + B*(Q+Qxy) */
 	srcs[0] = dp;
+	src_offs[0] = dp_off;
 	srcs[1] = dq;
+	src_offs[1] = dq_off;
 	coef[0] = raid6_gfexi[failb-faila];
 	coef[1] = raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]];
 	init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
-	tx = async_sum_product(dq, srcs, coef, bytes, submit);
+	tx = async_sum_product(dq, dq_off, srcs, src_offs, coef, bytes, submit);
 
 	/* Dy = P+Pxy+Dx */
 	srcs[0] = dp;
+	src_offs[0] = dp_off;
 	srcs[1] = dq;
+	src_offs[1] = dq_off;
 	init_async_submit(submit, flags | ASYNC_TX_XOR_DROP_DST, tx, cb_fn,
 			  cb_param, scribble);
-	tx = async_xor(dp, srcs, 0, 2, bytes, submit);
+	tx = async_xor_offs(dp, dp_off, srcs, src_offs, 2, bytes, submit);
 
 	return tx;
 }
 
 static struct dma_async_tx_descriptor *
 __2data_recov_n(int disks, size_t bytes, int faila, int failb,
-	      struct page **blocks, struct async_submit_ctl *submit)
+	      struct page **blocks, unsigned int *offs,
+		  struct async_submit_ctl *submit)
 {
 	struct dma_async_tx_descriptor *tx = NULL;
 	struct page *p, *q, *dp, *dq;
+	unsigned int p_off, q_off, dp_off, dq_off;
 	struct page *srcs[2];
+	unsigned int src_offs[2];
 	unsigned char coef[2];
 	enum async_tx_flags flags = submit->flags;
 	dma_async_tx_callback cb_fn = submit->cb_fn;
@@ -269,56 +307,74 @@ __2data_recov_n(int disks, size_t bytes, int faila, int failb,
 	void *scribble = submit->scribble;
 
 	p = blocks[disks-2];
+	p_off = offs[disks-2];
 	q = blocks[disks-1];
+	q_off = offs[disks-1];
 
 	/* Compute syndrome with zero for the missing data pages
 	 * Use the dead data pages as temporary storage for
 	 * delta p and delta q
 	 */
 	dp = blocks[faila];
+	dp_off = offs[faila];
 	blocks[faila] = NULL;
 	blocks[disks-2] = dp;
+	offs[disks-2] = dp_off;
 	dq = blocks[failb];
+	dq_off = offs[failb];
 	blocks[failb] = NULL;
 	blocks[disks-1] = dq;
+	offs[disks-1] = dq_off;
 
 	init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
-	tx = async_gen_syndrome(blocks, 0, disks, bytes, submit);
+	tx = async_gen_syndrome(blocks, offs, disks, bytes, submit);
 
 	/* Restore pointer table */
 	blocks[faila]   = dp;
+	offs[faila] = dp_off;
 	blocks[failb]   = dq;
+	offs[failb] = dq_off;
 	blocks[disks-2] = p;
+	offs[disks-2] = p_off;
 	blocks[disks-1] = q;
+	offs[disks-1] = q_off;
 
 	/* compute P + Pxy */
 	srcs[0] = dp;
+	src_offs[0] = dp_off;
 	srcs[1] = p;
+	src_offs[1] = p_off;
 	init_async_submit(submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
 			  NULL, NULL, scribble);
-	tx = async_xor(dp, srcs, 0, 2, bytes, submit);
+	tx = async_xor_offs(dp, dp_off, srcs, src_offs, 2, bytes, submit);
 
 	/* compute Q + Qxy */
 	srcs[0] = dq;
+	src_offs[0] = dq_off;
 	srcs[1] = q;
+	src_offs[1] = q_off;
 	init_async_submit(submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
 			  NULL, NULL, scribble);
-	tx = async_xor(dq, srcs, 0, 2, bytes, submit);
+	tx = async_xor_offs(dq, dq_off, srcs, src_offs, 2, bytes, submit);
 
 	/* Dx = A*(P+Pxy) + B*(Q+Qxy) */
 	srcs[0] = dp;
+	src_offs[0] = dp_off;
 	srcs[1] = dq;
+	src_offs[1] = dq_off;
 	coef[0] = raid6_gfexi[failb-faila];
 	coef[1] = raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]];
 	init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
-	tx = async_sum_product(dq, srcs, coef, bytes, submit);
+	tx = async_sum_product(dq, dq_off, srcs, src_offs, coef, bytes, submit);
 
 	/* Dy = P+Pxy+Dx */
 	srcs[0] = dp;
+	src_offs[0] = dp_off;
 	srcs[1] = dq;
+	src_offs[1] = dq_off;
 	init_async_submit(submit, flags | ASYNC_TX_XOR_DROP_DST, tx, cb_fn,
 			  cb_param, scribble);
-	tx = async_xor(dp, srcs, 0, 2, bytes, submit);
+	tx = async_xor_offs(dp, dp_off, srcs, src_offs, 2, bytes, submit);
 
 	return tx;
 }
@@ -330,11 +386,13 @@ __2data_recov_n(int disks, size_t bytes, int faila, int failb,
  * @faila: first failed drive index
  * @failb: second failed drive index
  * @blocks: array of source pointers where the last two entries are p and q
+ * @offs: array of offset for pages in blocks
  * @submit: submission/completion modifiers
  */
 struct dma_async_tx_descriptor *
 async_raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
-			struct page **blocks, struct async_submit_ctl *submit)
+			struct page **blocks, unsigned int *offs,
+			struct async_submit_ctl *submit)
 {
 	void *scribble = submit->scribble;
 	int non_zero_srcs, i;
@@ -358,7 +416,7 @@ async_raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
 			if (blocks[i] == NULL)
 				ptrs[i] = (void *) raid6_empty_zero_page;
 			else
-				ptrs[i] = page_address(blocks[i]);
+				ptrs[i] = page_address(blocks[i]) + offs[i];
 
 		raid6_2data_recov(disks, bytes, faila, failb, ptrs);
 
@@ -383,16 +441,19 @@ async_raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
 		 * explicitly handle the special case of a 4 disk array with
 		 * both data disks missing.
 		 */
-		return __2data_recov_4(disks, bytes, faila, failb, blocks, submit);
+		return __2data_recov_4(disks, bytes, faila, failb,
+				blocks, offs, submit);
 	case 3:
 		/* dma devices do not uniformly understand a single
 		 * source pq operation (in contrast to the synchronous
 		 * case), so explicitly handle the special case of a 5 disk
 		 * array with 2 of 3 data disks missing.
 		 */
-		return __2data_recov_5(disks, bytes, faila, failb, blocks, submit);
+		return __2data_recov_5(disks, bytes, faila, failb,
+				blocks, offs, submit);
 	default:
-		return __2data_recov_n(disks, bytes, faila, failb, blocks, submit);
+		return __2data_recov_n(disks, bytes, faila, failb,
+				blocks, offs, submit);
 	}
 }
 EXPORT_SYMBOL_GPL(async_raid6_2data_recov);
@@ -403,14 +464,17 @@ EXPORT_SYMBOL_GPL(async_raid6_2data_recov);
  * @bytes: block size
  * @faila: failed drive index
  * @blocks: array of source pointers where the last two entries are p and q
+ * @offs: array of offset for pages in blocks
  * @submit: submission/completion modifiers
  */
 struct dma_async_tx_descriptor *
 async_raid6_datap_recov(int disks, size_t bytes, int faila,
-			struct page **blocks, struct async_submit_ctl *submit)
+			struct page **blocks, unsigned int *offs,
+			struct async_submit_ctl *submit)
 {
 	struct dma_async_tx_descriptor *tx = NULL;
 	struct page *p, *q, *dq;
+	unsigned int p_off, q_off, dq_off;
 	u8 coef;
 	enum async_tx_flags flags = submit->flags;
 	dma_async_tx_callback cb_fn = submit->cb_fn;
@@ -418,6 +482,7 @@ async_raid6_datap_recov(int disks, size_t bytes, int faila,
 	void *scribble = submit->scribble;
 	int good_srcs, good, i;
 	struct page *srcs[2];
+	unsigned int src_offs[2];
 
 	pr_debug("%s: disks: %d len: %zu\n", __func__, disks, bytes);
 
@@ -434,7 +499,7 @@ async_raid6_datap_recov(int disks, size_t bytes, int faila,
 			if (blocks[i] == NULL)
 				ptrs[i] = (void*)raid6_empty_zero_page;
 			else
-				ptrs[i] = page_address(blocks[i]);
+				ptrs[i] = page_address(blocks[i]) + offs[i];
 
 		raid6_datap_recov(disks, bytes, faila, ptrs);
 
@@ -458,55 +523,67 @@ async_raid6_datap_recov(int disks, size_t bytes, int faila,
 	BUG_ON(good_srcs == 0);
 
 	p = blocks[disks-2];
+	p_off = offs[disks-2];
 	q = blocks[disks-1];
+	q_off = offs[disks-1];
 
 	/* Compute syndrome with zero for the missing data page
 	 * Use the dead data page as temporary storage for delta q
 	 */
 	dq = blocks[faila];
+	dq_off = offs[faila];
 	blocks[faila] = NULL;
 	blocks[disks-1] = dq;
+	offs[disks-1] = dq_off;
 
 	/* in the 4-disk case we only need to perform a single source
 	 * multiplication with the one good data block.
 	 */
 	if (good_srcs == 1) {
 		struct page *g = blocks[good];
+		unsigned int g_off = offs[good];
 
 		init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL,
 				  scribble);
-		tx = async_memcpy(p, g, 0, 0, bytes, submit);
+		tx = async_memcpy(p, g, p_off, g_off, bytes, submit);
 
 		init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL,
 				  scribble);
-		tx = async_mult(dq, g, raid6_gfexp[good], bytes, submit);
+		tx = async_mult(dq, dq_off, g, g_off,
+				raid6_gfexp[good], bytes, submit);
 	} else {
 		init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL,
 				  scribble);
-		tx = async_gen_syndrome(blocks, 0, disks, bytes, submit);
+		tx = async_gen_syndrome(blocks, offs, disks, bytes, submit);
 	}
 
 	/* Restore pointer table */
 	blocks[faila]   = dq;
+	offs[faila] = dq_off;
 	blocks[disks-1] = q;
+	offs[disks-1] = q_off;
 
 	/* calculate g^{-faila} */
 	coef = raid6_gfinv[raid6_gfexp[faila]];
 
 	srcs[0] = dq;
+	src_offs[0] = dq_off;
 	srcs[1] = q;
+	src_offs[1] = q_off;
 	init_async_submit(submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
 			  NULL, NULL, scribble);
-	tx = async_xor(dq, srcs, 0, 2, bytes, submit);
+	tx = async_xor_offs(dq, dq_off, srcs, src_offs, 2, bytes, submit);
 
 	init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
-	tx = async_mult(dq, dq, coef, bytes, submit);
+	tx = async_mult(dq, dq_off, dq, dq_off, coef, bytes, submit);
 
 	srcs[0] = p;
+	src_offs[0] = p_off;
 	srcs[1] = dq;
+	src_offs[1] = dq_off;
 	init_async_submit(submit, flags | ASYNC_TX_XOR_DROP_DST, tx, cb_fn,
 			  cb_param, scribble);
-	tx = async_xor(p, srcs, 0, 2, bytes, submit);
+	tx = async_xor_offs(p, p_off, srcs, src_offs, 2, bytes, submit);
 
 	return tx;
 }
diff --git a/crypto/async_tx/raid6test.c b/crypto/async_tx/raid6test.c
index ef4e6e68f759..66db82e5a3b1 100644
--- a/crypto/async_tx/raid6test.c
+++ b/crypto/async_tx/raid6test.c
@@ -101,12 +101,12 @@ static void raid6_dual_recov(int disks, size_t bytes, int faila, int failb,
 			/* data+P failure. */
 			init_async_submit(&submit, 0, NULL, NULL, NULL, addr_conv);
 			tx = async_raid6_datap_recov(disks, bytes,
-					faila, ptrs, &submit);
+					faila, ptrs, offs, &submit);
 		} else {
 			/* data+data failure. */
 			init_async_submit(&submit, 0, NULL, NULL, NULL, addr_conv);
 			tx = async_raid6_2data_recov(disks, bytes,
-					faila, failb, ptrs, &submit);
+					faila, failb, ptrs, offs, &submit);
 		}
 	}
 	init_completion(&cmp);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 35f0317f0717..d36b8192940b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1685,13 +1685,13 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
 			return async_raid6_datap_recov(syndrome_disks+2,
 						RAID5_STRIPE_SIZE(sh->raid_conf),
 						faila,
-						blocks, &submit);
+						blocks, offs, &submit);
 		} else {
 			/* We're missing D+D. */
 			return async_raid6_2data_recov(syndrome_disks+2,
 						RAID5_STRIPE_SIZE(sh->raid_conf),
 						faila, failb,
-						blocks, &submit);
+						blocks, offs, &submit);
 		}
 	}
 }
diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h
index efc5510f7d11..5cc73d7e5b52 100644
--- a/include/linux/async_tx.h
+++ b/include/linux/async_tx.h
@@ -196,11 +196,13 @@ async_syndrome_val(struct page **blocks, unsigned int *offsets, int src_cnt,
 
 struct dma_async_tx_descriptor *
 async_raid6_2data_recov(int src_num, size_t bytes, int faila, int failb,
-			struct page **ptrs, struct async_submit_ctl *submit);
+			struct page **ptrs, unsigned int *offs,
+			struct async_submit_ctl *submit);
 
 struct dma_async_tx_descriptor *
 async_raid6_datap_recov(int src_num, size_t bytes, int faila,
-			struct page **ptrs, struct async_submit_ctl *submit);
+			struct page **ptrs, unsigned int *offs,
+			struct async_submit_ctl *submit);
 
 void async_tx_quiesce(struct dma_async_tx_descriptor **tx);
 #endif /* _ASYNC_TX_H_ */
-- 
cgit v1.2.3


From 046169f048cdea84fd327469239e2dfa317981b5 Mon Sep 17 00:00:00 2001
From: Yufen Yu <yuyufen@huawei.com>
Date: Thu, 20 Aug 2020 09:22:12 -0400
Subject: md/raid5: let multiple devices of stripe_head share page

In current implementation, grow_buffers() uses alloc_page() to
allocate the buffers for each stripe_head, i.e. allocate a page
for each dev[i] in stripe_head.

After setting stripe_size as a configurable value by writing
sysfs entry, it means that we always allocate 64K buffers, but
just use 4K of them when stripe_size is 4K in 64KB arm64.

To avoid wasting memory, we try to let multiple sh->dev share
one real page. That means, multiple sh->dev[i].page will point
to the only page with different offset. Example of 64K PAGE_SIZE
and 4K stripe_size as following:

                    64K PAGE_SIZE
          +---+---+---+---+------------------------------+
          |   |   |   |   |
          |   |   |   |   |
          +-+-+-+-+-+-+-+-+------------------------------+
            ^   ^   ^   ^
            |   |   |   +----------------------------+
            |   |   |                                |
            |   |   +-------------------+            |
            |   |                       |            |
            |   +----------+            |            |
            |              |            |            |
            +-+            |            |            |
              |            |            |            |
        +-----+-----+------+-----+------+-----+------+------+
sh      | offset(0) | offset(4K) | offset(8K) | offset(12K) |
 +      +-----------+------------+------------+-------------+
 +----> dev[0].page  dev[1].page  dev[2].page  dev[3].page

A new 'pages' array will be added into stripe_head to record shared
page used by this stripe_head. Allocate them when grow_buffers()
and free them when shrink_buffers().

After trying to share page, the users of sh->dev[i].page need to take
care of the related page offset: page of issued bio and page passed
to xor compution functions. But thanks for previous different page offset
supported. Here, we just need to set correct dev[i].offset.

Signed-off-by: Yufen Yu <yuyufen@huawei.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 drivers/md/raid5.h | 28 ++++++++++++++++-
 2 files changed, 114 insertions(+), 3 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index d36b8192940b..ea1e56e21a31 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -448,13 +448,74 @@ out:
 	return sh;
 }
 
-static void shrink_buffers(struct stripe_head *sh)
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
+static void free_stripe_pages(struct stripe_head *sh)
+{
+	int i;
+	struct page *p;
+
+	/* Have not allocate page pool */
+	if (!sh->pages)
+		return;
+
+	for (i = 0; i < sh->nr_pages; i++) {
+		p = sh->pages[i];
+		if (p)
+			put_page(p);
+		sh->pages[i] = NULL;
+	}
+}
+
+static int alloc_stripe_pages(struct stripe_head *sh, gfp_t gfp)
 {
+	int i;
 	struct page *p;
+
+	for (i = 0; i < sh->nr_pages; i++) {
+		/* The page have allocated. */
+		if (sh->pages[i])
+			continue;
+
+		p = alloc_page(gfp);
+		if (!p) {
+			free_stripe_pages(sh);
+			return -ENOMEM;
+		}
+		sh->pages[i] = p;
+	}
+	return 0;
+}
+
+static int
+init_stripe_shared_pages(struct stripe_head *sh, struct r5conf *conf, int disks)
+{
+	int nr_pages, cnt;
+
+	if (sh->pages)
+		return 0;
+
+	/* Each of the sh->dev[i] need one conf->stripe_size */
+	cnt = PAGE_SIZE / conf->stripe_size;
+	nr_pages = (disks + cnt - 1) / cnt;
+
+	sh->pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
+	if (!sh->pages)
+		return -ENOMEM;
+	sh->nr_pages = nr_pages;
+	sh->stripes_per_page = cnt;
+	return 0;
+}
+#endif
+
+static void shrink_buffers(struct stripe_head *sh)
+{
 	int i;
 	int num = sh->raid_conf->pool_size;
 
+#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
 	for (i = 0; i < num ; i++) {
+		struct page *p;
+
 		WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
 		p = sh->dev[i].page;
 		if (!p)
@@ -462,6 +523,11 @@ static void shrink_buffers(struct stripe_head *sh)
 		sh->dev[i].page = NULL;
 		put_page(p);
 	}
+#else
+	for (i = 0; i < num; i++)
+		sh->dev[i].page = NULL;
+	free_stripe_pages(sh); /* Free pages */
+#endif
 }
 
 static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
@@ -469,6 +535,7 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
 	int i;
 	int num = sh->raid_conf->pool_size;
 
+#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
 	for (i = 0; i < num; i++) {
 		struct page *page;
 
@@ -479,7 +546,16 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
 		sh->dev[i].orig_page = page;
 		sh->dev[i].offset = 0;
 	}
+#else
+	if (alloc_stripe_pages(sh, gfp))
+		return -ENOMEM;
 
+	for (i = 0; i < num; i++) {
+		sh->dev[i].page = raid5_get_dev_page(sh, i);
+		sh->dev[i].orig_page = sh->dev[i].page;
+		sh->dev[i].offset = raid5_get_page_offset(sh, i);
+	}
+#endif
 	return 0;
 }
 
@@ -2205,6 +2281,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 
 static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
 {
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
+	kfree(sh->pages);
+#endif
 	if (sh->ppl_page)
 		__free_page(sh->ppl_page);
 	kmem_cache_free(sc, sh);
@@ -2238,9 +2317,15 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
 			sh->ppl_page = alloc_page(gfp);
 			if (!sh->ppl_page) {
 				free_stripe(sc, sh);
-				sh = NULL;
+				return NULL;
 			}
 		}
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
+		if (init_stripe_shared_pages(sh, conf, disks)) {
+			free_stripe(sc, sh);
+			return NULL;
+		}
+#endif
 	}
 	return sh;
 }
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 6afdd3a78580..5c05acf20e1f 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -195,6 +195,7 @@ enum reconstruct_states {
 	reconstruct_state_result,
 };
 
+#define DEFAULT_STRIPE_SIZE	4096
 struct stripe_head {
 	struct hlist_node	hash;
 	struct list_head	lru;	      /* inactive_list or handle_list */
@@ -246,6 +247,13 @@ struct stripe_head {
 		int 		     target, target2;
 		enum sum_check_flags zero_sum_result;
 	} ops;
+
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
+	/* These pages will be used by bios in dev[i] */
+	struct page	**pages;
+	int	nr_pages;	/* page array size */
+	int	stripes_per_page;
+#endif
 	struct r5dev {
 		/* rreq and rvec are used for the replacement device when
 		 * writing data to both devices.
@@ -473,7 +481,6 @@ struct disk_info {
  */
 
 #define NR_STRIPES		256
-#define DEFAULT_STRIPE_SIZE	4096
 
 #if PAGE_SIZE == DEFAULT_STRIPE_SIZE
 #define STRIPE_SIZE		PAGE_SIZE
@@ -772,6 +779,25 @@ static inline int algorithm_is_DDF(int layout)
 	return layout >= 8 && layout <= 10;
 }
 
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
+/*
+ * Return offset of the corresponding page for r5dev.
+ */
+static inline int raid5_get_page_offset(struct stripe_head *sh, int disk_idx)
+{
+	return (disk_idx % sh->stripes_per_page) * RAID5_STRIPE_SIZE(sh->raid_conf);
+}
+
+/*
+ * Return corresponding page address for r5dev.
+ */
+static inline struct page *
+raid5_get_dev_page(struct stripe_head *sh, int disk_idx)
+{
+	return sh->pages[disk_idx / sh->stripes_per_page];
+}
+#endif
+
 extern void md_raid5_kick_device(struct r5conf *conf);
 extern int raid5_set_cache_size(struct mddev *mddev, int size);
 extern sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous);
-- 
cgit v1.2.3


From f16acaf328c5615fdaea74f9bd0b4019544532d6 Mon Sep 17 00:00:00 2001
From: Yufen Yu <yuyufen@huawei.com>
Date: Thu, 20 Aug 2020 09:22:13 -0400
Subject: md/raid5: resize stripe_head when reshape array

When reshape array, we try to reuse shared pages of old stripe_head,
and allocate more for the new one if needed.

Signed-off-by: Yufen Yu <yuyufen@huawei.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index ea1e56e21a31..c23e408f7623 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2535,6 +2535,12 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 		osh = get_free_stripe(conf, hash);
 		unlock_device_hash_lock(conf, hash);
 
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
+	for (i = 0; i < osh->nr_pages; i++) {
+		nsh->pages[i] = osh->pages[i];
+		osh->pages[i] = NULL;
+	}
+#endif
 		for(i=0; i<conf->pool_size; i++) {
 			nsh->dev[i].page = osh->dev[i].page;
 			nsh->dev[i].orig_page = osh->dev[i].page;
@@ -2589,6 +2595,23 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 		nsh = list_entry(newstripes.next, struct stripe_head, lru);
 		list_del_init(&nsh->lru);
 
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
+		for (i = 0; i < nsh->nr_pages; i++) {
+			if (nsh->pages[i])
+				continue;
+			nsh->pages[i] = alloc_page(GFP_NOIO);
+			if (!nsh->pages[i])
+				err = -ENOMEM;
+		}
+
+		for (i = conf->raid_disks; i < newsize; i++) {
+			if (nsh->dev[i].page)
+				continue;
+			nsh->dev[i].page = raid5_get_dev_page(nsh, i);
+			nsh->dev[i].orig_page = nsh->dev[i].page;
+			nsh->dev[i].offset = raid5_get_page_offset(nsh, i);
+		}
+#else
 		for (i=conf->raid_disks; i < newsize; i++)
 			if (nsh->dev[i].page == NULL) {
 				struct page *p = alloc_page(GFP_NOIO);
@@ -2598,6 +2621,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 				if (!p)
 					err = -ENOMEM;
 			}
+#endif
 		raid5_release_stripe(nsh);
 	}
 	/* critical section pass, GFP_NOIO no longer needed */
-- 
cgit v1.2.3


From 389125844352c784abf8c186589343466d03cd6a Mon Sep 17 00:00:00 2001
From: Yufen Yu <yuyufen@huawei.com>
Date: Thu, 20 Aug 2020 09:22:14 -0400
Subject: md/raid5: reallocate page array after setting new stripe_size

When try to resize stripe_size, we also need to free old
shared page array and allocate new.

Signed-off-by: Yufen Yu <yuyufen@huawei.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index c23e408f7623..66690b40818e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6682,6 +6682,7 @@ raid5_store_stripe_size(struct mddev  *mddev, const char *page, size_t len)
 	struct r5conf *conf;
 	unsigned long new;
 	int err;
+	int size;
 
 	if (len >= PAGE_SIZE)
 		return -EINVAL;
@@ -6714,10 +6715,29 @@ raid5_store_stripe_size(struct mddev  *mddev, const char *page, size_t len)
 	pr_debug("md/raid: change stripe_size from %lu to %lu\n",
 			conf->stripe_size, new);
 
+	if (mddev->sync_thread ||
+		test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
+		mddev->reshape_position != MaxSector ||
+		mddev->sysfs_active) {
+		err = -EBUSY;
+		goto out_unlock;
+	}
+
 	mddev_suspend(mddev);
+	mutex_lock(&conf->cache_size_mutex);
+	size = conf->max_nr_stripes;
+
+	shrink_stripes(conf);
+
 	conf->stripe_size = new;
 	conf->stripe_shift = ilog2(new) - 9;
 	conf->stripe_sectors = new >> 9;
+	if (grow_stripes(conf, size)) {
+		pr_warn("md/raid:%s: couldn't allocate buffers\n",
+				mdname(mddev));
+		err = -ENOMEM;
+	}
+	mutex_unlock(&conf->cache_size_mutex);
 	mddev_resume(mddev);
 
 out_unlock:
-- 
cgit v1.2.3


From e287308b8388590ad70e8e88d1cc70a4643bacd7 Mon Sep 17 00:00:00 2001
From: Zhen Lei <thunder.leizhen@huawei.com>
Date: Fri, 21 Aug 2020 09:29:18 +0800
Subject: md: Simplify code with existing definition RESYNC_SECTORS in raid10.c

#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)

"RESYNC_BLOCK_SIZE/512" is equal to "RESYNC_BLOCK_SIZE >> 9", replace it
with RESYNC_SECTORS.

Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid10.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 5d1bdee313ec..3b598a3cb462 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -4458,8 +4458,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 		last = conf->reshape_progress - 1;
 		sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
 					       & conf->prev.chunk_mask);
-		if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
-			sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
+		if (sector_nr + RESYNC_SECTORS < last)
+			sector_nr = last + 1 - RESYNC_SECTORS;
 	} else {
 		/* 'next' is after the last device address that we
 		 * might write to for this chunk in the new layout
@@ -4481,8 +4481,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 		last  = sector_nr | (conf->geo.chunk_mask
 				     & conf->prev.chunk_mask);
 
-		if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
-			last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
+		if (sector_nr + RESYNC_SECTORS <= last)
+			last = sector_nr + RESYNC_SECTORS - 1;
 	}
 
 	if (need_flush ||
-- 
cgit v1.2.3


From 2628089b74d5a64bd0bcb5d247a18f78d7b6f4d0 Mon Sep 17 00:00:00 2001
From: Xiao Ni <xni@redhat.com>
Date: Tue, 25 Aug 2020 13:42:59 +0800
Subject: md: add md_submit_discard_bio() for submitting discard bio

Move these logic from raid0.c to md.c, so that we can also use it in
raid10.c.

Reviewed-by: Coly Li <colyli@suse.de>
Reviewed-by: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
Signed-off-by: Xiao Ni <xni@redhat.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/md.c    | 20 ++++++++++++++++++++
 drivers/md/md.h    |  2 ++
 drivers/md/raid0.c | 14 ++------------
 3 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 64bc22d2b606..dfff4bb2bb20 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8583,6 +8583,26 @@ void md_write_end(struct mddev *mddev)
 
 EXPORT_SYMBOL(md_write_end);
 
+/* This is used by raid0 and raid10 */
+void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
+			struct bio *bio, sector_t start, sector_t size)
+{
+	struct bio *discard_bio = NULL;
+
+	if (__blkdev_issue_discard(rdev->bdev, start, size,
+		GFP_NOIO, 0, &discard_bio) || !discard_bio)
+		return;
+
+	bio_chain(discard_bio, bio);
+	bio_clone_blkg_association(discard_bio, bio);
+	if (mddev->gendisk)
+		trace_block_bio_remap(bdev_get_queue(rdev->bdev),
+			discard_bio, disk_devt(mddev->gendisk),
+			bio->bi_iter.bi_sector);
+	submit_bio_noacct(discard_bio);
+}
+EXPORT_SYMBOL(md_submit_discard_bio);
+
 /* md_allow_write(mddev)
  * Calling this ensures that the array is marked 'active' so that writes
  * may proceed without blocking.  It is important to call this before
diff --git a/drivers/md/md.h b/drivers/md/md.h
index f9e2ccdd22c4..543366041911 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -713,6 +713,8 @@ extern void md_write_end(struct mddev *mddev);
 extern void md_done_sync(struct mddev *mddev, int blocks, int ok);
 extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
 extern void md_finish_reshape(struct mddev *mddev);
+extern void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
+			struct bio *bio, sector_t start, sector_t size);
 
 extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio);
 extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index aa2d72791768..e2de0cced707 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -494,7 +494,6 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
 
 	for (disk = 0; disk < zone->nb_dev; disk++) {
 		sector_t dev_start, dev_end;
-		struct bio *discard_bio = NULL;
 		struct md_rdev *rdev;
 
 		if (disk < start_disk_index)
@@ -517,18 +516,9 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
 
 		rdev = conf->devlist[(zone - conf->strip_zone) *
 			conf->strip_zone[0].nb_dev + disk];
-		if (__blkdev_issue_discard(rdev->bdev,
+		md_submit_discard_bio(mddev, rdev, bio,
 			dev_start + zone->dev_start + rdev->data_offset,
-			dev_end - dev_start, GFP_NOIO, 0, &discard_bio) ||
-		    !discard_bio)
-			continue;
-		bio_chain(discard_bio, bio);
-		bio_clone_blkg_association(discard_bio, bio);
-		if (mddev->gendisk)
-			trace_block_bio_remap(bdev_get_queue(rdev->bdev),
-				discard_bio, disk_devt(mddev->gendisk),
-				bio->bi_iter.bi_sector);
-		submit_bio_noacct(discard_bio);
+			dev_end - dev_start);
 	}
 	bio_endio(bio);
 }
-- 
cgit v1.2.3


From 8650a889017cb1f6ea6813ccf83a2e9f6fa49dd3 Mon Sep 17 00:00:00 2001
From: Xiao Ni <xni@redhat.com>
Date: Tue, 25 Aug 2020 13:43:00 +0800
Subject: md/raid10: extend r10bio devs to raid disks

Now it allocs r10bio->devs[conf->copies]. Discard bio needs to submit
to all member disks and it needs to use r10bio. So extend to
r10bio->devs[geo.raid_disks].

Reviewed-by: Coly Li <colyli@suse.de>
Signed-off-by: Xiao Ni <xni@redhat.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid10.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 3b598a3cb462..f2ec44fda1a0 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -91,7 +91,7 @@ static inline struct r10bio *get_resync_r10bio(struct bio *bio)
 static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	struct r10conf *conf = data;
-	int size = offsetof(struct r10bio, devs[conf->copies]);
+	int size = offsetof(struct r10bio, devs[conf->geo.raid_disks]);
 
 	/* allocate a r10bio with room for raid_disks entries in the
 	 * bios array */
@@ -238,7 +238,7 @@ static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
 {
 	int i;
 
-	for (i = 0; i < conf->copies; i++) {
+	for (i = 0; i < conf->geo.raid_disks; i++) {
 		struct bio **bio = & r10_bio->devs[i].bio;
 		if (!BIO_SPECIAL(*bio))
 			bio_put(*bio);
@@ -327,7 +327,7 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
 	int slot;
 	int repl = 0;
 
-	for (slot = 0; slot < conf->copies; slot++) {
+	for (slot = 0; slot < conf->geo.raid_disks; slot++) {
 		if (r10_bio->devs[slot].bio == bio)
 			break;
 		if (r10_bio->devs[slot].repl_bio == bio) {
@@ -336,7 +336,6 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
 		}
 	}
 
-	BUG_ON(slot == conf->copies);
 	update_head_pos(slot, r10_bio);
 
 	if (slotp)
@@ -1493,7 +1492,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
 	r10_bio->mddev = mddev;
 	r10_bio->sector = bio->bi_iter.bi_sector;
 	r10_bio->state = 0;
-	memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->copies);
+	memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->geo.raid_disks);
 
 	if (bio_data_dir(bio) == READ)
 		raid10_read_request(mddev, bio, r10_bio);
-- 
cgit v1.2.3


From f046f5d0d79cdb968f219ce249e497fd1accf484 Mon Sep 17 00:00:00 2001
From: Xiao Ni <xni@redhat.com>
Date: Tue, 25 Aug 2020 13:43:01 +0800
Subject: md/raid10: pull codes that wait for blocked dev into one function

The following patch will reuse these logics, so pull the same codes into
one function.

Signed-off-by: Xiao Ni <xni@redhat.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid10.c | 118 +++++++++++++++++++++++++++++-----------------------
 1 file changed, 67 insertions(+), 51 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index f2ec44fda1a0..4ad8447fa868 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1275,12 +1275,75 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
 	}
 }
 
+static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
+{
+	int i;
+	struct r10conf *conf = mddev->private;
+	struct md_rdev *blocked_rdev;
+
+retry_wait:
+	blocked_rdev = NULL;
+	rcu_read_lock();
+	for (i = 0; i < conf->copies; i++) {
+		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
+		struct md_rdev *rrdev = rcu_dereference(
+			conf->mirrors[i].replacement);
+		if (rdev == rrdev)
+			rrdev = NULL;
+		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
+			atomic_inc(&rdev->nr_pending);
+			blocked_rdev = rdev;
+			break;
+		}
+		if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
+			atomic_inc(&rrdev->nr_pending);
+			blocked_rdev = rrdev;
+			break;
+		}
+
+		if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
+			sector_t first_bad;
+			sector_t dev_sector = r10_bio->devs[i].addr;
+			int bad_sectors;
+			int is_bad;
+
+			/* Discard request doesn't care the write result
+			 * so it doesn't need to wait blocked disk here.
+			 */
+			if (!r10_bio->sectors)
+				continue;
+
+			is_bad = is_badblock(rdev, dev_sector, r10_bio->sectors,
+					     &first_bad, &bad_sectors);
+			if (is_bad < 0) {
+				/* Mustn't write here until the bad block
+				 * is acknowledged
+				 */
+				atomic_inc(&rdev->nr_pending);
+				set_bit(BlockedBadBlocks, &rdev->flags);
+				blocked_rdev = rdev;
+				break;
+			}
+		}
+	}
+	rcu_read_unlock();
+
+	if (unlikely(blocked_rdev)) {
+		/* Have to wait for this device to get unblocked, then retry */
+		allow_barrier(conf);
+		raid10_log(conf->mddev, "%s wait rdev %d blocked",
+				__func__, blocked_rdev->raid_disk);
+		md_wait_for_blocked_rdev(blocked_rdev, mddev);
+		wait_barrier(conf);
+		goto retry_wait;
+	}
+}
+
 static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 				 struct r10bio *r10_bio)
 {
 	struct r10conf *conf = mddev->private;
 	int i;
-	struct md_rdev *blocked_rdev;
 	sector_t sectors;
 	int max_sectors;
 
@@ -1338,8 +1401,9 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 
 	r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
 	raid10_find_phys(conf, r10_bio);
-retry_write:
-	blocked_rdev = NULL;
+
+	wait_blocked_dev(mddev, r10_bio);
+
 	rcu_read_lock();
 	max_sectors = r10_bio->sectors;
 
@@ -1350,16 +1414,6 @@ retry_write:
 			conf->mirrors[d].replacement);
 		if (rdev == rrdev)
 			rrdev = NULL;
-		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
-			atomic_inc(&rdev->nr_pending);
-			blocked_rdev = rdev;
-			break;
-		}
-		if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
-			atomic_inc(&rrdev->nr_pending);
-			blocked_rdev = rrdev;
-			break;
-		}
 		if (rdev && (test_bit(Faulty, &rdev->flags)))
 			rdev = NULL;
 		if (rrdev && (test_bit(Faulty, &rrdev->flags)))
@@ -1380,15 +1434,6 @@ retry_write:
 
 			is_bad = is_badblock(rdev, dev_sector, max_sectors,
 					     &first_bad, &bad_sectors);
-			if (is_bad < 0) {
-				/* Mustn't write here until the bad block
-				 * is acknowledged
-				 */
-				atomic_inc(&rdev->nr_pending);
-				set_bit(BlockedBadBlocks, &rdev->flags);
-				blocked_rdev = rdev;
-				break;
-			}
 			if (is_bad && first_bad <= dev_sector) {
 				/* Cannot write here at all */
 				bad_sectors -= (dev_sector - first_bad);
@@ -1424,35 +1469,6 @@ retry_write:
 	}
 	rcu_read_unlock();
 
-	if (unlikely(blocked_rdev)) {
-		/* Have to wait for this device to get unblocked, then retry */
-		int j;
-		int d;
-
-		for (j = 0; j < i; j++) {
-			if (r10_bio->devs[j].bio) {
-				d = r10_bio->devs[j].devnum;
-				rdev_dec_pending(conf->mirrors[d].rdev, mddev);
-			}
-			if (r10_bio->devs[j].repl_bio) {
-				struct md_rdev *rdev;
-				d = r10_bio->devs[j].devnum;
-				rdev = conf->mirrors[d].replacement;
-				if (!rdev) {
-					/* Race with remove_disk */
-					smp_mb();
-					rdev = conf->mirrors[d].rdev;
-				}
-				rdev_dec_pending(rdev, mddev);
-			}
-		}
-		allow_barrier(conf);
-		raid10_log(conf->mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
-		md_wait_for_blocked_rdev(blocked_rdev, mddev);
-		wait_barrier(conf);
-		goto retry_write;
-	}
-
 	if (max_sectors < r10_bio->sectors)
 		r10_bio->sectors = max_sectors;
 
-- 
cgit v1.2.3


From bcc90d280465ebd51ab8688be86e1f00c62dccf9 Mon Sep 17 00:00:00 2001
From: Xiao Ni <xni@redhat.com>
Date: Wed, 2 Sep 2020 20:00:22 +0800
Subject: md/raid10: improve raid10 discard request
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now the discard request is split by chunk size. So it takes a long time
to finish mkfs on disks which support discard function. This patch improve
handling raid10 discard request. It uses the similar way with patch
29efc390b (md/md0: optimize raid0 discard handling).

But it's a little complex than raid0. Because raid10 has different layout.
If raid10 is offset layout and the discard request is smaller than stripe
size. There are some holes when we submit discard bio to underlayer disks.

For example: five disks (disk1 - disk5)
D01 D02 D03 D04 D05
D05 D01 D02 D03 D04
D06 D07 D08 D09 D10
D10 D06 D07 D08 D09
The discard bio just wants to discard from D03 to D10. For disk3, there is
a hole between D03 and D08. For disk4, there is a hole between D04 and D09.
D03 is a chunk, raid10_write_request can handle one chunk perfectly. So
the part that is not aligned with stripe size is still handled by
raid10_write_request.

If reshape is running when discard bio comes and the discard bio spans the
reshape position, raid10_write_request is responsible to handle this
discard bio.

I did a test with this patch set.
Without patch:
time mkfs.xfs /dev/md0
real4m39.775s
user0m0.000s
sys0m0.298s

With patch:
time mkfs.xfs /dev/md0
real0m0.105s
user0m0.000s
sys0m0.007s

nvme3n1           259:1    0   477G  0 disk
└─nvme3n1p1       259:10   0    50G  0 part
nvme4n1           259:2    0   477G  0 disk
└─nvme4n1p1       259:11   0    50G  0 part
nvme5n1           259:6    0   477G  0 disk
└─nvme5n1p1       259:12   0    50G  0 part
nvme2n1           259:9    0   477G  0 disk
└─nvme2n1p1       259:15   0    50G  0 part
nvme0n1           259:13   0   477G  0 disk
└─nvme0n1p1       259:14   0    50G  0 part

Reviewed-by: Coly Li <colyli@suse.de>
Reviewed-by: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
Signed-off-by: Xiao Ni <xni@redhat.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid10.c | 256 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 255 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 4ad8447fa868..05773ee1b5ba 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1516,6 +1516,256 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
 		raid10_write_request(mddev, bio, r10_bio);
 }
 
+static struct bio *raid10_split_bio(struct r10conf *conf,
+			struct bio *bio, sector_t sectors, bool want_first)
+{
+	struct bio *split;
+
+	split = bio_split(bio, sectors,	GFP_NOIO, &conf->bio_split);
+	bio_chain(split, bio);
+	allow_barrier(conf);
+	if (want_first) {
+		submit_bio_noacct(bio);
+		bio = split;
+	} else
+		submit_bio_noacct(split);
+	wait_barrier(conf);
+
+	return bio;
+}
+
+static void raid10_end_discard_request(struct bio *bio)
+{
+	struct r10bio *r10_bio = bio->bi_private;
+	struct r10conf *conf = r10_bio->mddev->private;
+	struct md_rdev *rdev = NULL;
+	int dev;
+	int slot, repl;
+
+	/*
+	 * We don't care the return value of discard bio
+	 */
+	if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
+		set_bit(R10BIO_Uptodate, &r10_bio->state);
+
+	dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
+	if (repl)
+		rdev = conf->mirrors[dev].replacement;
+	if (!rdev) {
+		/* raid10_remove_disk uses smp_mb to make sure rdev is set to
+		 * replacement before setting replacement to NULL. It can read
+		 * rdev first without barrier protect even replacment is NULL
+		 */
+		smp_rmb();
+		rdev = conf->mirrors[dev].rdev;
+	}
+
+	if (atomic_dec_and_test(&r10_bio->remaining)) {
+		md_write_end(r10_bio->mddev);
+		raid_end_bio_io(r10_bio);
+	}
+
+	rdev_dec_pending(rdev, conf->mddev);
+}
+
+/* There are some limitations to handle discard bio
+ * 1st, the discard size is bigger than stripe_size*2.
+ * 2st, if the discard bio spans reshape progress, we use the old way to
+ * handle discard bio
+ */
+static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
+{
+	struct r10conf *conf = mddev->private;
+	struct geom *geo = &conf->geo;
+	struct r10bio *r10_bio;
+
+	int disk;
+	sector_t chunk;
+	unsigned int stripe_size;
+	sector_t split_size;
+
+	sector_t bio_start, bio_end;
+	sector_t first_stripe_index, last_stripe_index;
+	sector_t start_disk_offset;
+	unsigned int start_disk_index;
+	sector_t end_disk_offset;
+	unsigned int end_disk_index;
+	unsigned int remainder;
+
+	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+		return -EAGAIN;
+
+	wait_barrier(conf);
+
+	/* Check reshape again to avoid reshape happens after checking
+	 * MD_RECOVERY_RESHAPE and before wait_barrier
+	 */
+	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+		goto out;
+
+	stripe_size = geo->raid_disks << geo->chunk_shift;
+	bio_start = bio->bi_iter.bi_sector;
+	bio_end = bio_end_sector(bio);
+
+	/* Maybe one discard bio is smaller than strip size or across one stripe
+	 * and discard region is larger than one stripe size. For far offset layout,
+	 * if the discard region is not aligned with stripe size, there is hole
+	 * when we submit discard bio to member disk. For simplicity, we only
+	 * handle discard bio which discard region is bigger than stripe_size*2
+	 */
+	if (bio_sectors(bio) < stripe_size*2)
+		goto out;
+
+	/* For far offset layout, if bio is not aligned with stripe size, it splits
+	 * the part that is not aligned with strip size.
+	 */
+	div_u64_rem(bio_start, stripe_size, &remainder);
+	if (geo->far_offset && remainder) {
+		split_size = stripe_size - remainder;
+		bio = raid10_split_bio(conf, bio, split_size, false);
+	}
+	div_u64_rem(bio_end, stripe_size, &remainder);
+	if (geo->far_offset && remainder) {
+		split_size = bio_sectors(bio) - remainder;
+		bio = raid10_split_bio(conf, bio, split_size, true);
+	}
+
+	r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
+	r10_bio->mddev = mddev;
+	r10_bio->state = 0;
+	r10_bio->sectors = 0;
+	memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks);
+
+	wait_blocked_dev(mddev, r10_bio);
+
+	r10_bio->master_bio = bio;
+
+	bio_start = bio->bi_iter.bi_sector;
+	bio_end = bio_end_sector(bio);
+
+	/* raid10 uses chunk as the unit to store data. It's similar like raid0.
+	 * One stripe contains the chunks from all member disk (one chunk from
+	 * one disk at the same HBA address). For layout detail, see 'man md 4'
+	 */
+	chunk = bio_start >> geo->chunk_shift;
+	chunk *= geo->near_copies;
+	first_stripe_index = chunk;
+	start_disk_index = sector_div(first_stripe_index, geo->raid_disks);
+	if (geo->far_offset)
+		first_stripe_index *= geo->far_copies;
+	start_disk_offset = (bio_start & geo->chunk_mask) +
+				(first_stripe_index << geo->chunk_shift);
+
+	chunk = bio_end >> geo->chunk_shift;
+	chunk *= geo->near_copies;
+	last_stripe_index = chunk;
+	end_disk_index = sector_div(last_stripe_index, geo->raid_disks);
+	if (geo->far_offset)
+		last_stripe_index *= geo->far_copies;
+	end_disk_offset = (bio_end & geo->chunk_mask) +
+				(last_stripe_index << geo->chunk_shift);
+
+	rcu_read_lock();
+	for (disk = 0; disk < geo->raid_disks; disk++) {
+		struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
+		struct md_rdev *rrdev = rcu_dereference(
+			conf->mirrors[disk].replacement);
+
+		r10_bio->devs[disk].bio = NULL;
+		r10_bio->devs[disk].repl_bio = NULL;
+
+		if (rdev && (test_bit(Faulty, &rdev->flags)))
+			rdev = NULL;
+		if (rrdev && (test_bit(Faulty, &rrdev->flags)))
+			rrdev = NULL;
+		if (!rdev && !rrdev)
+			continue;
+
+		if (rdev) {
+			r10_bio->devs[disk].bio = bio;
+			atomic_inc(&rdev->nr_pending);
+		}
+		if (rrdev) {
+			r10_bio->devs[disk].repl_bio = bio;
+			atomic_inc(&rrdev->nr_pending);
+		}
+	}
+	rcu_read_unlock();
+
+	atomic_set(&r10_bio->remaining, 1);
+	for (disk = 0; disk < geo->raid_disks; disk++) {
+		sector_t dev_start, dev_end;
+		struct bio *mbio, *rbio = NULL;
+		struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
+		struct md_rdev *rrdev = rcu_dereference(
+			conf->mirrors[disk].replacement);
+
+		/*
+		 * Now start to calculate the start and end address for each disk.
+		 * The space between dev_start and dev_end is the discard region.
+		 *
+		 * For dev_start, it needs to consider three conditions:
+		 * 1st, the disk is before start_disk, you can imagine the disk in
+		 * the next stripe. So the dev_start is the start address of next
+		 * stripe.
+		 * 2st, the disk is after start_disk, it means the disk is at the
+		 * same stripe of first disk
+		 * 3st, the first disk itself, we can use start_disk_offset directly
+		 */
+		if (disk < start_disk_index)
+			dev_start = (first_stripe_index + 1) * mddev->chunk_sectors;
+		else if (disk > start_disk_index)
+			dev_start = first_stripe_index * mddev->chunk_sectors;
+		else
+			dev_start = start_disk_offset;
+
+		if (disk < end_disk_index)
+			dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
+		else if (disk > end_disk_index)
+			dev_end = last_stripe_index * mddev->chunk_sectors;
+		else
+			dev_end = end_disk_offset;
+
+		/* It only handles discard bio which size is >= stripe size, so
+		 * dev_end > dev_start all the time
+		 */
+		if (r10_bio->devs[disk].bio) {
+			mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
+			mbio->bi_end_io = raid10_end_discard_request;
+			mbio->bi_private = r10_bio;
+			r10_bio->devs[disk].bio = mbio;
+			r10_bio->devs[disk].devnum = disk;
+			atomic_inc(&r10_bio->remaining);
+			md_submit_discard_bio(mddev, rdev, mbio,
+					dev_start + choose_data_offset(r10_bio, rdev),
+					dev_end - dev_start);
+			bio_endio(mbio);
+		}
+		if (r10_bio->devs[disk].repl_bio) {
+			rbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
+			rbio->bi_end_io = raid10_end_discard_request;
+			rbio->bi_private = r10_bio;
+			r10_bio->devs[disk].repl_bio = rbio;
+			r10_bio->devs[disk].devnum = disk;
+			atomic_inc(&r10_bio->remaining);
+			md_submit_discard_bio(mddev, rrdev, rbio,
+					dev_start + choose_data_offset(r10_bio, rrdev),
+					dev_end - dev_start);
+			bio_endio(rbio);
+		}
+	}
+
+	if (atomic_dec_and_test(&r10_bio->remaining)) {
+		md_write_end(r10_bio->mddev);
+		raid_end_bio_io(r10_bio);
+	}
+
+	return 0;
+out:
+	allow_barrier(conf);
+	return -EAGAIN;
+}
+
 static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
 {
 	struct r10conf *conf = mddev->private;
@@ -1530,6 +1780,10 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
 	if (!md_write_start(mddev, bio))
 		return false;
 
+	if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
+		if (!raid10_handle_discard(mddev, bio))
+			return true;
+
 	/*
 	 * If this request crosses a chunk boundary, we need to split
 	 * it.
@@ -3769,7 +4023,7 @@ static int raid10_run(struct mddev *mddev)
 
 	if (mddev->queue) {
 		blk_queue_max_discard_sectors(mddev->queue,
-					      mddev->chunk_sectors);
+					      UINT_MAX);
 		blk_queue_max_write_same_sectors(mddev->queue, 0);
 		blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
 		blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
-- 
cgit v1.2.3


From d3ee2d8415a6256c1c41e1be36e80e640c3e6359 Mon Sep 17 00:00:00 2001
From: Xiao Ni <xni@redhat.com>
Date: Wed, 2 Sep 2020 20:00:23 +0800
Subject: md/raid10: improve discard request for far layout

For far layout, the discard region is not continuous on disks. So it needs
far copies r10bio to cover all regions. It needs a way to know all r10bios
have finish or not. Similar with raid10_sync_request, only the first r10bio
master_bio records the discard bio. Other r10bios master_bio record the
first r10bio. The first r10bio can finish after other r10bios finish and
then return the discard bio.

Signed-off-by: Xiao Ni <xni@redhat.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid10.c | 86 +++++++++++++++++++++++++++++++++++++++--------------
 drivers/md/raid10.h |  1 +
 2 files changed, 64 insertions(+), 23 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 05773ee1b5ba..b7bca6703df8 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1534,6 +1534,28 @@ static struct bio *raid10_split_bio(struct r10conf *conf,
 	return bio;
 }
 
+static void raid_end_discard_bio(struct r10bio *r10bio)
+{
+	struct r10conf *conf = r10bio->mddev->private;
+	struct r10bio *first_r10bio;
+
+	while (atomic_dec_and_test(&r10bio->remaining)) {
+
+		allow_barrier(conf);
+
+		if (!test_bit(R10BIO_Discard, &r10bio->state)) {
+			first_r10bio = (struct r10bio *)r10bio->master_bio;
+			free_r10bio(r10bio);
+			r10bio = first_r10bio;
+		} else {
+			md_write_end(r10bio->mddev);
+			bio_endio(r10bio->master_bio);
+			free_r10bio(r10bio);
+			break;
+		}
+	}
+}
+
 static void raid10_end_discard_request(struct bio *bio)
 {
 	struct r10bio *r10_bio = bio->bi_private;
@@ -1560,11 +1582,7 @@ static void raid10_end_discard_request(struct bio *bio)
 		rdev = conf->mirrors[dev].rdev;
 	}
 
-	if (atomic_dec_and_test(&r10_bio->remaining)) {
-		md_write_end(r10_bio->mddev);
-		raid_end_bio_io(r10_bio);
-	}
-
+	raid_end_discard_bio(r10_bio);
 	rdev_dec_pending(rdev, conf->mddev);
 }
 
@@ -1577,7 +1595,9 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
 {
 	struct r10conf *conf = mddev->private;
 	struct geom *geo = &conf->geo;
-	struct r10bio *r10_bio;
+	struct r10bio *r10_bio, *first_r10bio;
+	int far_copies = geo->far_copies;
+	bool first_copy = true;
 
 	int disk;
 	sector_t chunk;
@@ -1616,30 +1636,20 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
 	if (bio_sectors(bio) < stripe_size*2)
 		goto out;
 
-	/* For far offset layout, if bio is not aligned with stripe size, it splits
-	 * the part that is not aligned with strip size.
+	/* For far and far offset layout, if bio is not aligned with stripe size,
+	 * it splits the part that is not aligned with strip size.
 	 */
 	div_u64_rem(bio_start, stripe_size, &remainder);
-	if (geo->far_offset && remainder) {
+	if ((far_copies > 1) && remainder) {
 		split_size = stripe_size - remainder;
 		bio = raid10_split_bio(conf, bio, split_size, false);
 	}
 	div_u64_rem(bio_end, stripe_size, &remainder);
-	if (geo->far_offset && remainder) {
+	if ((far_copies > 1) && remainder) {
 		split_size = bio_sectors(bio) - remainder;
 		bio = raid10_split_bio(conf, bio, split_size, true);
 	}
 
-	r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
-	r10_bio->mddev = mddev;
-	r10_bio->state = 0;
-	r10_bio->sectors = 0;
-	memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks);
-
-	wait_blocked_dev(mddev, r10_bio);
-
-	r10_bio->master_bio = bio;
-
 	bio_start = bio->bi_iter.bi_sector;
 	bio_end = bio_end_sector(bio);
 
@@ -1665,6 +1675,28 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
 	end_disk_offset = (bio_end & geo->chunk_mask) +
 				(last_stripe_index << geo->chunk_shift);
 
+retry_discard:
+	r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
+	r10_bio->mddev = mddev;
+	r10_bio->state = 0;
+	r10_bio->sectors = 0;
+	memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks);
+	wait_blocked_dev(mddev, r10_bio);
+
+	/* For far layout it needs more than one r10bio to cover all regions.
+	 * Inspired by raid10_sync_request, we can use the first r10bio->master_bio
+	 * to record the discard bio. Other r10bio->master_bio record the first
+	 * r10bio. The first r10bio only release after all other r10bios finish.
+	 * The discard bio returns only first r10bio finishes
+	 */
+	if (first_copy) {
+		r10_bio->master_bio = bio;
+		set_bit(R10BIO_Discard, &r10_bio->state);
+		first_copy = false;
+		first_r10bio = r10_bio;
+	} else
+		r10_bio->master_bio = (struct bio *)first_r10bio;
+
 	rcu_read_lock();
 	for (disk = 0; disk < geo->raid_disks; disk++) {
 		struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
@@ -1755,11 +1787,19 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
 		}
 	}
 
-	if (atomic_dec_and_test(&r10_bio->remaining)) {
-		md_write_end(r10_bio->mddev);
-		raid_end_bio_io(r10_bio);
+	if (!geo->far_offset && --far_copies) {
+		first_stripe_index += geo->stride >> geo->chunk_shift;
+		start_disk_offset += geo->stride;
+		last_stripe_index += geo->stride >> geo->chunk_shift;
+		end_disk_offset += geo->stride;
+		atomic_inc(&first_r10bio->remaining);
+		raid_end_discard_bio(r10_bio);
+		wait_barrier(conf);
+		goto retry_discard;
 	}
 
+	raid_end_discard_bio(r10_bio);
+
 	return 0;
 out:
 	allow_barrier(conf);
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 79cd2b7d3128..1461fd55311b 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -179,5 +179,6 @@ enum r10bio_state {
 	R10BIO_Previous,
 /* failfast devices did receive failfast requests. */
 	R10BIO_FailFast,
+	R10BIO_Discard,
 };
 #endif
-- 
cgit v1.2.3


From b2702aaaa45c2d1028bc8c51b49106041be7aae5 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Wed, 16 Sep 2020 18:11:02 -0700
Subject: nvme: lift the file open code from nvme_ctrl_get_by_path

Lift opening the file open/close code from nvme_ctrl_get_by_path into
the caller, just keeping a simple nvme_ctrl_from_file() helper.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
[hch: refactored a bit, split the bug fixes into a separate prep patch]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
---
 drivers/nvme/host/core.c       | 25 +++++--------------------
 drivers/nvme/host/nvme.h       |  2 +-
 drivers/nvme/target/passthru.c | 27 ++++++++++++++++-----------
 3 files changed, 22 insertions(+), 32 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 64f4bf85c3d1..35cd177bda08 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4668,28 +4668,13 @@ void nvme_sync_queues(struct nvme_ctrl *ctrl)
 }
 EXPORT_SYMBOL_GPL(nvme_sync_queues);
 
-struct nvme_ctrl *nvme_ctrl_get_by_path(const char *path)
+struct nvme_ctrl *nvme_ctrl_from_file(struct file *file)
 {
-	struct nvme_ctrl *ctrl;
-	struct file *f;
-
-	f = filp_open(path, O_RDWR, 0);
-	if (IS_ERR(f))
-		return ERR_CAST(f);
-
-	if (f->f_op != &nvme_dev_fops) {
-		ctrl = ERR_PTR(-EINVAL);
-		goto out_close;
-	}
-
-	ctrl = f->private_data;
-	nvme_get_ctrl(ctrl);
-
-out_close:
-	filp_close(f, NULL);
-	return ctrl;
+	if (file->f_op != &nvme_dev_fops)
+		return NULL;
+	return file->private_data;
 }
-EXPORT_SYMBOL_NS_GPL(nvme_ctrl_get_by_path, NVME_TARGET_PASSTHRU);
+EXPORT_SYMBOL_NS_GPL(nvme_ctrl_from_file, NVME_TARGET_PASSTHRU);
 
 /*
  * Check we didn't inadvertently grow the command structure sizes:
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 398394bcbee4..4749baa457bc 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -822,7 +822,7 @@ static inline void nvme_hwmon_init(struct nvme_ctrl *ctrl) { }
 u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 			 u8 opcode);
 void nvme_execute_passthru_rq(struct request *rq);
-struct nvme_ctrl *nvme_ctrl_get_by_path(const char *path);
+struct nvme_ctrl *nvme_ctrl_from_file(struct file *file);
 struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid);
 void nvme_put_ns(struct nvme_ns *ns);
 
diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c
index dacfa7435d0b..e3a5f8499a0b 100644
--- a/drivers/nvme/target/passthru.c
+++ b/drivers/nvme/target/passthru.c
@@ -474,6 +474,7 @@ u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req)
 int nvmet_passthru_ctrl_enable(struct nvmet_subsys *subsys)
 {
 	struct nvme_ctrl *ctrl;
+	struct file *file;
 	int ret = -EINVAL;
 	void *old;
 
@@ -488,24 +489,29 @@ int nvmet_passthru_ctrl_enable(struct nvmet_subsys *subsys)
 		goto out_unlock;
 	}
 
-	ctrl = nvme_ctrl_get_by_path(subsys->passthru_ctrl_path);
-	if (IS_ERR(ctrl)) {
-		ret = PTR_ERR(ctrl);
+	file = filp_open(subsys->passthru_ctrl_path, O_RDWR, 0);
+	if (IS_ERR(file)) {
+		ret = PTR_ERR(file);
+		goto out_unlock;
+	}
+
+	ctrl = nvme_ctrl_from_file(file);
+	if (!ctrl) {
 		pr_err("failed to open nvme controller %s\n",
 		       subsys->passthru_ctrl_path);
 
-		goto out_unlock;
+		goto out_put_file;
 	}
 
 	old = xa_cmpxchg(&passthru_subsystems, ctrl->cntlid, NULL,
 			 subsys, GFP_KERNEL);
 	if (xa_is_err(old)) {
 		ret = xa_err(old);
-		goto out_put_ctrl;
+		goto out_put_file;
 	}
 
 	if (old)
-		goto out_put_ctrl;
+		goto out_put_file;
 
 	subsys->passthru_ctrl = ctrl;
 	subsys->ver = ctrl->vs;
@@ -516,13 +522,12 @@ int nvmet_passthru_ctrl_enable(struct nvmet_subsys *subsys)
 			NVME_TERTIARY(subsys->ver));
 		subsys->ver = NVME_VS(1, 2, 1);
 	}
-
+	nvme_get_ctrl(ctrl);
 	__module_get(subsys->passthru_ctrl->ops->module);
-	mutex_unlock(&subsys->lock);
-	return 0;
+	ret = 0;
 
-out_put_ctrl:
-	nvme_put_ctrl(ctrl);
+out_put_file:
+	filp_close(file, NULL);
 out_unlock:
 	mutex_unlock(&subsys->lock);
 	return ret;
-- 
cgit v1.2.3


From 1cf7a12e09aa48512404bf3f0f429b73182ce793 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Tue, 22 Sep 2020 14:05:29 -0700
Subject: nvme: use an xarray to lookup the Commands Supported and Effects log

When using linked list we have to open code the locking, search, and
destroy operations with the loops even if data structure doesn't fall
into the fast path.

One of the main advantage of having XArray to store, search, and remove
items is that it handles all the locking by itself, avoids the loops
when using linked lists, provides clear API to replace the linked list's
search and destroy loops.

This patch replaces the ctrl->cel list with XArray and removes :-

a. Extra code needed for the linked list for ctrl->cel item management
   such as nvme_find_cel().
b. Destroy loop in the nvme_free_ctrl().
c. Explicit insertion locking in the nvme_get_effects_log().

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c | 31 ++++---------------------------
 drivers/nvme/host/nvme.h |  2 +-
 2 files changed, 5 insertions(+), 28 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 35cd177bda08..0a985c601c62 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3025,26 +3025,10 @@ int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
 	return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
 }
 
-static struct nvme_cel *nvme_find_cel(struct nvme_ctrl *ctrl, u8 csi)
-{
-	struct nvme_cel *cel, *ret = NULL;
-
-	spin_lock_irq(&ctrl->lock);
-	list_for_each_entry(cel, &ctrl->cels, entry) {
-		if (cel->csi == csi) {
-			ret = cel;
-			break;
-		}
-	}
-	spin_unlock_irq(&ctrl->lock);
-
-	return ret;
-}
-
 static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
 				struct nvme_effects_log **log)
 {
-	struct nvme_cel *cel = nvme_find_cel(ctrl, csi);
+	struct nvme_cel *cel = xa_load(&ctrl->cels, csi);
 	int ret;
 
 	if (cel)
@@ -3062,10 +3046,7 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
 	}
 
 	cel->csi = csi;
-
-	spin_lock_irq(&ctrl->lock);
-	list_add_tail(&cel->entry, &ctrl->cels);
-	spin_unlock_irq(&ctrl->lock);
+	xa_store(&ctrl->cels, cel->csi, cel, GFP_KERNEL);
 out:
 	*log = &cel->log;
 	return 0;
@@ -4448,15 +4429,11 @@ static void nvme_free_ctrl(struct device *dev)
 	struct nvme_ctrl *ctrl =
 		container_of(dev, struct nvme_ctrl, ctrl_device);
 	struct nvme_subsystem *subsys = ctrl->subsys;
-	struct nvme_cel *cel, *next;
 
 	if (!subsys || ctrl->instance != subsys->instance)
 		ida_simple_remove(&nvme_instance_ida, ctrl->instance);
 
-	list_for_each_entry_safe(cel, next, &ctrl->cels, entry) {
-		list_del(&cel->entry);
-		kfree(cel);
-	}
+	xa_destroy(&ctrl->cels);
 
 	nvme_mpath_uninit(ctrl);
 	__free_page(ctrl->discard_page);
@@ -4488,7 +4465,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	spin_lock_init(&ctrl->lock);
 	mutex_init(&ctrl->scan_lock);
 	INIT_LIST_HEAD(&ctrl->namespaces);
-	INIT_LIST_HEAD(&ctrl->cels);
+	xa_init(&ctrl->cels);
 	init_rwsem(&ctrl->namespaces_rwsem);
 	ctrl->dev = dev;
 	ctrl->ops = ops;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 4749baa457bc..1096ef1f6aa2 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -300,7 +300,7 @@ struct nvme_ctrl {
 	unsigned long quirks;
 	struct nvme_id_power_state psd[32];
 	struct nvme_effects_log *effects;
-	struct list_head cels;
+	struct xarray cels;
 	struct work_struct scan_work;
 	struct work_struct async_event_work;
 	struct delayed_work ka_work;
-- 
cgit v1.2.3


From 0b85f59d30b91bd2b93ea7ef0816a4b7e7039e8c Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 18 Aug 2020 11:35:30 +0300
Subject: nvme-pci: Move enumeration by class to be last in the table

It's unusual that we have enumeration by class in the middle of the table.
It might potentially be problematic in the future if we add another entry
after it.

So, move class matching entry to be the last in the ID table.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/pci.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 08c5a9b438f3..dabe42f605b1 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -3187,7 +3187,6 @@ static const struct pci_device_id nvme_id_table[] = {
 				NVME_QUIRK_IGNORE_DEV_SUBNQN, },
 	{ PCI_DEVICE(0x1c5c, 0x1504),   /* SK Hynix PC400 */
 		.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
-	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001),
 		.driver_data = NVME_QUIRK_SINGLE_VECTOR },
 	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
@@ -3195,6 +3194,8 @@ static const struct pci_device_id nvme_id_table[] = {
 		.driver_data = NVME_QUIRK_SINGLE_VECTOR |
 				NVME_QUIRK_128_BYTES_SQES |
 				NVME_QUIRK_SHARED_TAGS },
+
+	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
 	{ 0, }
 };
 MODULE_DEVICE_TABLE(pci, nvme_id_table);
-- 
cgit v1.2.3


From f7790e5de92a3b88e74704a6cb2276c09c32f511 Mon Sep 17 00:00:00 2001
From: Mark Wunderlich <mark.wunderlich@intel.com>
Date: Fri, 28 Aug 2020 01:00:53 +0000
Subject: nvmet-tcp: have queue io_work context run on sock incoming cpu

No real good need to spread queues artificially. Usually the
target will serve multiple hosts, and it's better to run on the socket
incoming cpu for better affinitization rather than spread queues on all
online cpus.

We rely on RSS to spread the work around sufficiently.

Signed-off-by: Mark Wunderlich <mark.wunderlich@intel.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/tcp.c | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 8e0d766d2722..dc1f0f647189 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -94,7 +94,6 @@ struct nvmet_tcp_queue {
 	struct socket		*sock;
 	struct nvmet_tcp_port	*port;
 	struct work_struct	io_work;
-	int			cpu;
 	struct nvmet_cq		nvme_cq;
 	struct nvmet_sq		nvme_sq;
 
@@ -144,7 +143,6 @@ struct nvmet_tcp_port {
 	struct work_struct	accept_work;
 	struct nvmet_port	*nport;
 	struct sockaddr_storage addr;
-	int			last_cpu;
 	void (*data_ready)(struct sock *);
 };
 
@@ -219,6 +217,11 @@ static inline void nvmet_tcp_put_cmd(struct nvmet_tcp_cmd *cmd)
 	list_add_tail(&cmd->entry, &cmd->queue->free_list);
 }
 
+static inline int queue_cpu(struct nvmet_tcp_queue *queue)
+{
+	return queue->sock->sk->sk_incoming_cpu;
+}
+
 static inline u8 nvmet_tcp_hdgst_len(struct nvmet_tcp_queue *queue)
 {
 	return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
@@ -506,7 +509,7 @@ static void nvmet_tcp_queue_response(struct nvmet_req *req)
 	struct nvmet_tcp_queue	*queue = cmd->queue;
 
 	llist_add(&cmd->lentry, &queue->resp_list);
-	queue_work_on(cmd->queue->cpu, nvmet_tcp_wq, &cmd->queue->io_work);
+	queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &cmd->queue->io_work);
 }
 
 static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd)
@@ -1223,7 +1226,7 @@ static void nvmet_tcp_io_work(struct work_struct *w)
 	 * We exahusted our budget, requeue our selves
 	 */
 	if (pending)
-		queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
+		queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
 }
 
 static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue,
@@ -1383,7 +1386,7 @@ static void nvmet_tcp_data_ready(struct sock *sk)
 	read_lock_bh(&sk->sk_callback_lock);
 	queue = sk->sk_user_data;
 	if (likely(queue))
-		queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
+		queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
 	read_unlock_bh(&sk->sk_callback_lock);
 }
 
@@ -1403,7 +1406,7 @@ static void nvmet_tcp_write_space(struct sock *sk)
 
 	if (sk_stream_is_writeable(sk)) {
 		clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-		queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
+		queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
 	}
 out:
 	read_unlock_bh(&sk->sk_callback_lock);
@@ -1512,9 +1515,6 @@ static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
 	if (ret)
 		goto out_free_connect;
 
-	port->last_cpu = cpumask_next_wrap(port->last_cpu,
-				cpu_online_mask, -1, false);
-	queue->cpu = port->last_cpu;
 	nvmet_prepare_receive_pdu(queue);
 
 	mutex_lock(&nvmet_tcp_queue_mutex);
@@ -1525,7 +1525,7 @@ static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
 	if (ret)
 		goto out_destroy_sq;
 
-	queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
+	queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
 
 	return 0;
 out_destroy_sq:
@@ -1612,7 +1612,6 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport)
 	}
 
 	port->nport = nport;
-	port->last_cpu = -1;
 	INIT_WORK(&port->accept_work, nvmet_tcp_accept_work);
 	if (port->nport->inline_data_size < 0)
 		port->nport->inline_data_size = NVMET_TCP_DEF_INLINE_DATA_SIZE;
-- 
cgit v1.2.3


From 4e683c48db018e72e81e1e0d12a5020d29be561a Mon Sep 17 00:00:00 2001
From: Amit Engel <amit.engel@dell.com>
Date: Wed, 16 Sep 2020 20:47:20 +0300
Subject: nvmet: handle keep-alive timer when kato is modified by a set
 features cmd

A user may modify the kato by a set features cmd.  To properly deal
with races or a kato value of 0 (no keep alive enabled) change
nvmet_set_feat_kato to first disable the timer, then set the value
and then re-enable the timer.

Signed-off-by: Amit Engel <amit.engel@dell.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/admin-cmd.c | 2 ++
 drivers/nvme/target/core.c      | 4 ++--
 drivers/nvme/target/nvmet.h     | 2 ++
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index e9fe91786bbb..dca34489a1dc 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -727,7 +727,9 @@ u16 nvmet_set_feat_kato(struct nvmet_req *req)
 {
 	u32 val32 = le32_to_cpu(req->cmd->common.cdw11);
 
+	nvmet_stop_keep_alive_timer(req->sq->ctrl);
 	req->sq->ctrl->kato = DIV_ROUND_UP(val32, 1000);
+	nvmet_start_keep_alive_timer(req->sq->ctrl);
 
 	nvmet_set_result(req, req->sq->ctrl->kato);
 
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index b7b63330b5ef..25d62d867563 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -395,7 +395,7 @@ static void nvmet_keep_alive_timer(struct work_struct *work)
 	nvmet_ctrl_fatal_error(ctrl);
 }
 
-static void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl)
+void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl)
 {
 	if (unlikely(ctrl->kato == 0))
 		return;
@@ -407,7 +407,7 @@ static void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl)
 	schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
 }
 
-static void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl)
+void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl)
 {
 	if (unlikely(ctrl->kato == 0))
 		return;
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 47ee3fb193bd..559a15ccc322 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -395,6 +395,8 @@ void nvmet_get_feat_async_event(struct nvmet_req *req);
 u16 nvmet_set_feat_kato(struct nvmet_req *req);
 u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask);
 void nvmet_execute_async_event(struct nvmet_req *req);
+void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl);
+void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl);
 
 u16 nvmet_parse_connect_cmd(struct nvmet_req *req);
 void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id);
-- 
cgit v1.2.3


From 5b3356d9da88fe6d6f3bcd8a2336d14a767f377d Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Tue, 22 Sep 2020 14:17:18 -0700
Subject: nvmet: add passthru ZNS support

In the default passthru implementation NVMeOF target passthru ctrl is
not capable of handling Zoned Namespaces (ZNS).

Update the nvmet_parse_pasthru_admin_cmd() to allow
NVME_ID_CNS_CS_CTRL/NVME_CIS_ZNS and NVME_ID_CNS_CS_NS/NVME_CIS_ZNS.

With this addition NVMeOF Passthru allows Zoned Namespaces.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/passthru.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c
index e3a5f8499a0b..56c571052216 100644
--- a/drivers/nvme/target/passthru.c
+++ b/drivers/nvme/target/passthru.c
@@ -456,10 +456,26 @@ u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req)
 			req->execute = nvmet_passthru_execute_cmd;
 			req->p.use_workqueue = true;
 			return NVME_SC_SUCCESS;
+		case NVME_ID_CNS_CS_CTRL:
+			switch (req->cmd->identify.csi) {
+			case NVME_CSI_ZNS:
+				req->execute = nvmet_passthru_execute_cmd;
+				req->p.use_workqueue = true;
+				return NVME_SC_SUCCESS;
+			}
+			return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
 		case NVME_ID_CNS_NS:
 			req->execute = nvmet_passthru_execute_cmd;
 			req->p.use_workqueue = true;
 			return NVME_SC_SUCCESS;
+		case NVME_ID_CNS_CS_NS:
+			switch (req->cmd->identify.csi) {
+			case NVME_CSI_ZNS:
+				req->execute = nvmet_passthru_execute_cmd;
+				req->p.use_workqueue = true;
+				return NVME_SC_SUCCESS;
+			}
+			return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
 		default:
 			return nvmet_setup_passthru_command(req);
 		}
-- 
cgit v1.2.3


From ddd3d1051797b9f907ab9799f5ba50398c530676 Mon Sep 17 00:00:00 2001
From: James Smart <james.smart@broadcom.com>
Date: Tue, 22 Sep 2020 14:54:12 -0700
Subject: nvmet-fc: fix missing check for no hostport struct

A hostport port pointer is allowed to be NULL as it is not allocated if
the lldd does not support the new interfaces for NVME LS request support.
The hostport free routine validates the handle but forgot to validate the
hostport pointer.

Validate the hostport pointer before using it to validate the handle.

Signed-off-by: James Smart <james.smart@broadcom.com>
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/fc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index e6861cc10e7d..cd4e73aa9807 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -1019,7 +1019,7 @@ static void
 nvmet_fc_free_hostport(struct nvmet_fc_hostport *hostport)
 {
 	/* if LLDD not implemented, leave as NULL */
-	if (!hostport->hosthandle)
+	if (!hostport || !hostport->hosthandle)
 		return;
 
 	nvmet_fc_hostport_put(hostport);
-- 
cgit v1.2.3


From 936fab503ff4af94f5f9c0b549f3ab4d435500ec Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 30 Aug 2020 12:00:10 +0200
Subject: nvme: fix error handling in nvme_ns_report_zones

nvme_submit_sync_cmd can return positive NVMe error codes in addition to
the negative Linux error code, which are currently ignored.  Fix this
by removing __nvme_ns_report_zones and handling the errors from
nvme_submit_sync_cmd in the caller instead of multiplexing the return
value and the number of zones reported into a single return value.

Fixes: 240e6ee272c0 ("nvme: support for zoned namespaces")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
---
 drivers/nvme/host/zns.c | 41 ++++++++++++++++-------------------------
 1 file changed, 16 insertions(+), 25 deletions(-)

diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
index 57cfd78731fb..53efecb67898 100644
--- a/drivers/nvme/host/zns.c
+++ b/drivers/nvme/host/zns.c
@@ -133,28 +133,6 @@ static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
 	return NULL;
 }
 
-static int __nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
-				  struct nvme_zone_report *report,
-				  size_t buflen)
-{
-	struct nvme_command c = { };
-	int ret;
-
-	c.zmr.opcode = nvme_cmd_zone_mgmt_recv;
-	c.zmr.nsid = cpu_to_le32(ns->head->ns_id);
-	c.zmr.slba = cpu_to_le64(nvme_sect_to_lba(ns, sector));
-	c.zmr.numd = cpu_to_le32(nvme_bytes_to_numd(buflen));
-	c.zmr.zra = NVME_ZRA_ZONE_REPORT;
-	c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL;
-	c.zmr.pr = NVME_REPORT_ZONE_PARTIAL;
-
-	ret = nvme_submit_sync_cmd(ns->queue, &c, report, buflen);
-	if (ret)
-		return ret;
-
-	return le64_to_cpu(report->nr_zones);
-}
-
 static int nvme_zone_parse_entry(struct nvme_ns *ns,
 				 struct nvme_zone_descriptor *entry,
 				 unsigned int idx, report_zones_cb cb,
@@ -182,6 +160,7 @@ static int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
 			unsigned int nr_zones, report_zones_cb cb, void *data)
 {
 	struct nvme_zone_report *report;
+	struct nvme_command c = { };
 	int ret, zone_idx = 0;
 	unsigned int nz, i;
 	size_t buflen;
@@ -190,14 +169,26 @@ static int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
 	if (!report)
 		return -ENOMEM;
 
+	c.zmr.opcode = nvme_cmd_zone_mgmt_recv;
+	c.zmr.nsid = cpu_to_le32(ns->head->ns_id);
+	c.zmr.numd = cpu_to_le32(nvme_bytes_to_numd(buflen));
+	c.zmr.zra = NVME_ZRA_ZONE_REPORT;
+	c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL;
+	c.zmr.pr = NVME_REPORT_ZONE_PARTIAL;
+
 	sector &= ~(ns->zsze - 1);
 	while (zone_idx < nr_zones && sector < get_capacity(ns->disk)) {
 		memset(report, 0, buflen);
-		ret = __nvme_ns_report_zones(ns, sector, report, buflen);
-		if (ret < 0)
+
+		c.zmr.slba = cpu_to_le64(nvme_sect_to_lba(ns, sector));
+		ret = nvme_submit_sync_cmd(ns->queue, &c, report, buflen);
+		if (ret) {
+			if (ret > 0)
+				ret = -EIO;
 			goto out_free;
+		}
 
-		nz = min_t(unsigned int, ret, nr_zones);
+		nz = min((unsigned int)le64_to_cpu(report->nr_zones), nr_zones);
 		if (!nz)
 			break;
 
-- 
cgit v1.2.3


From 21cc2f3f799fa228f812f7ab971fa8d43c893392 Mon Sep 17 00:00:00 2001
From: Jeffle Xu <jefflexu@linux.alibaba.com>
Date: Thu, 24 Sep 2020 09:01:22 +0200
Subject: nvme-pci: allocate separate interrupt for the reserved non-polled I/O
 queue

One queue will be reserved for non-polled IO when nvme.poll_queues is
greater or equal than the number of IO queues that the nvme controller
can provide. Currently the reserved queue for non-polled IO will reuse
the interrupt used by admin queue in this case, e.g, vector 0.

This can work and the performance may not be an issue since the admin
queue is used unfrequently. However this behaviour may be inconsistent
with that when nvme.poll_queues is smaller than the number of IO
queues available.

Thus allocate separate interrupt for this reserved queue, and thus make
the behaviour consistent.

Signed-off-by: Jeffle Xu <jefflexu@linux.alibaba.com>
[hch: minor cleanups, mostly to the pre-existing surrounding code]
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/pci.c | 32 +++++++++++++++-----------------
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index dabe42f605b1..cb645ac0c2dc 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2038,32 +2038,30 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
 		.calc_sets	= nvme_calc_irq_sets,
 		.priv		= dev,
 	};
-	unsigned int irq_queues, this_p_queues;
+	unsigned int irq_queues, poll_queues;
 
 	/*
-	 * Poll queues don't need interrupts, but we need at least one IO
-	 * queue left over for non-polled IO.
+	 * Poll queues don't need interrupts, but we need at least one I/O queue
+	 * left over for non-polled I/O.
 	 */
-	this_p_queues = dev->nr_poll_queues;
-	if (this_p_queues >= nr_io_queues) {
-		this_p_queues = nr_io_queues - 1;
-		irq_queues = 1;
-	} else {
-		irq_queues = nr_io_queues - this_p_queues + 1;
-	}
-	dev->io_queues[HCTX_TYPE_POLL] = this_p_queues;
+	poll_queues = min(dev->nr_poll_queues, nr_io_queues - 1);
+	dev->io_queues[HCTX_TYPE_POLL] = poll_queues;
 
-	/* Initialize for the single interrupt case */
+	/*
+	 * Initialize for the single interrupt case, will be updated in
+	 * nvme_calc_irq_sets().
+	 */
 	dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
 	dev->io_queues[HCTX_TYPE_READ] = 0;
 
 	/*
-	 * Some Apple controllers require all queues to use the
-	 * first vector.
+	 * We need interrupts for the admin queue and each non-polled I/O queue,
+	 * but some Apple controllers require all queues to use the first
+	 * vector.
 	 */
-	if (dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR)
-		irq_queues = 1;
-
+	irq_queues = 1;
+	if (!(dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR))
+		irq_queues += (nr_io_queues - poll_queues);
 	return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
 			      PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
 }
-- 
cgit v1.2.3


From dc4d137ee3b79a7474b747b4b326d472ccc2cb79 Mon Sep 17 00:00:00 2001
From: Niklas Cassel <niklas.cassel@wdc.com>
Date: Fri, 28 Aug 2020 12:54:00 +0200
Subject: null_blk: add support for max open/active zone limit for zoned
 devices

Add support for user space to set a max open zone and a max active zone
limit via configfs. By default, the default values are 0 == no limit.

Call the block layer API functions used for exposing the configured
limits to sysfs.

Add accounting in null_blk_zoned so that these new limits are respected.
Performing an operation that would exceed these limits results in a
standard I/O error.

A max open zone limit exists in the ZBC standard.
While null_blk_zoned is used to test the Zoned Block Device model in
Linux, when it comes to differences between ZBC and ZNS, null_blk_zoned
mostly follows ZBC.

Therefore, implement the manage open zone resources function from ZBC,
but additionally add support for max active zones.
This enables user space not only to test against a device with an open
zone limit, but also to test against a device with an active zone limit.

Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/null_blk.h       |   5 +
 drivers/block/null_blk_main.c  |  16 ++-
 drivers/block/null_blk_zoned.c | 319 +++++++++++++++++++++++++++++++++--------
 3 files changed, 282 insertions(+), 58 deletions(-)

diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h
index daed4a9c3436..d2e7db43a52a 100644
--- a/drivers/block/null_blk.h
+++ b/drivers/block/null_blk.h
@@ -42,6 +42,9 @@ struct nullb_device {
 	struct badblocks badblocks;
 
 	unsigned int nr_zones;
+	unsigned int nr_zones_imp_open;
+	unsigned int nr_zones_exp_open;
+	unsigned int nr_zones_closed;
 	struct blk_zone *zones;
 	sector_t zone_size_sects;
 
@@ -51,6 +54,8 @@ struct nullb_device {
 	unsigned long zone_size; /* zone size in MB if device is zoned */
 	unsigned long zone_capacity; /* zone capacity in MB if device is zoned */
 	unsigned int zone_nr_conv; /* number of conventional zones */
+	unsigned int zone_max_open; /* max number of open zones */
+	unsigned int zone_max_active; /* max number of active zones */
 	unsigned int submit_queues; /* number of submission queues */
 	unsigned int home_node; /* home node for the device */
 	unsigned int queue_mode; /* block interface */
diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c
index 55d2ef92be47..4685ea401d5b 100644
--- a/drivers/block/null_blk_main.c
+++ b/drivers/block/null_blk_main.c
@@ -212,6 +212,14 @@ static unsigned int g_zone_nr_conv;
 module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444);
 MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block device is zoned. Default: 0");
 
+static unsigned int g_zone_max_open;
+module_param_named(zone_max_open, g_zone_max_open, uint, 0444);
+MODULE_PARM_DESC(zone_max_open, "Maximum number of open zones when block device is zoned. Default: 0 (no limit)");
+
+static unsigned int g_zone_max_active;
+module_param_named(zone_max_active, g_zone_max_active, uint, 0444);
+MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block device is zoned. Default: 0 (no limit)");
+
 static struct nullb_device *null_alloc_dev(void);
 static void null_free_dev(struct nullb_device *dev);
 static void null_del_dev(struct nullb *nullb);
@@ -351,6 +359,8 @@ NULLB_DEVICE_ATTR(zoned, bool, NULL);
 NULLB_DEVICE_ATTR(zone_size, ulong, NULL);
 NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL);
 NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL);
+NULLB_DEVICE_ATTR(zone_max_open, uint, NULL);
+NULLB_DEVICE_ATTR(zone_max_active, uint, NULL);
 
 static ssize_t nullb_device_power_show(struct config_item *item, char *page)
 {
@@ -468,6 +478,8 @@ static struct configfs_attribute *nullb_device_attrs[] = {
 	&nullb_device_attr_zone_size,
 	&nullb_device_attr_zone_capacity,
 	&nullb_device_attr_zone_nr_conv,
+	&nullb_device_attr_zone_max_open,
+	&nullb_device_attr_zone_max_active,
 	NULL,
 };
 
@@ -521,7 +533,7 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item)
 static ssize_t memb_group_features_show(struct config_item *item, char *page)
 {
 	return snprintf(page, PAGE_SIZE,
-			"memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv\n");
+			"memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv,zone_max_open,zone_max_active\n");
 }
 
 CONFIGFS_ATTR_RO(memb_group_, features);
@@ -584,6 +596,8 @@ static struct nullb_device *null_alloc_dev(void)
 	dev->zone_size = g_zone_size;
 	dev->zone_capacity = g_zone_capacity;
 	dev->zone_nr_conv = g_zone_nr_conv;
+	dev->zone_max_open = g_zone_max_open;
+	dev->zone_max_active = g_zone_max_active;
 	return dev;
 }
 
diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c
index 3d25c9ad2383..fa0cc70f05e6 100644
--- a/drivers/block/null_blk_zoned.c
+++ b/drivers/block/null_blk_zoned.c
@@ -51,6 +51,22 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q)
 			dev->zone_nr_conv);
 	}
 
+	/* Max active zones has to be < nbr of seq zones in order to be enforceable */
+	if (dev->zone_max_active >= dev->nr_zones - dev->zone_nr_conv) {
+		dev->zone_max_active = 0;
+		pr_info("zone_max_active limit disabled, limit >= zone count\n");
+	}
+
+	/* Max open zones has to be <= max active zones */
+	if (dev->zone_max_active && dev->zone_max_open > dev->zone_max_active) {
+		dev->zone_max_open = dev->zone_max_active;
+		pr_info("changed the maximum number of open zones to %u\n",
+			dev->nr_zones);
+	} else if (dev->zone_max_open >= dev->nr_zones - dev->zone_nr_conv) {
+		dev->zone_max_open = 0;
+		pr_info("zone_max_open limit disabled, limit >= zone count\n");
+	}
+
 	for (i = 0; i <  dev->zone_nr_conv; i++) {
 		struct blk_zone *zone = &dev->zones[i];
 
@@ -99,6 +115,8 @@ int null_register_zoned_dev(struct nullb *nullb)
 	}
 
 	blk_queue_max_zone_append_sectors(q, dev->zone_size_sects);
+	blk_queue_max_open_zones(q, dev->zone_max_open);
+	blk_queue_max_active_zones(q, dev->zone_max_active);
 
 	return 0;
 }
@@ -159,6 +177,103 @@ size_t null_zone_valid_read_len(struct nullb *nullb,
 	return (zone->wp - sector) << SECTOR_SHIFT;
 }
 
+static blk_status_t null_close_zone(struct nullb_device *dev, struct blk_zone *zone)
+{
+	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
+		return BLK_STS_IOERR;
+
+	switch (zone->cond) {
+	case BLK_ZONE_COND_CLOSED:
+		/* close operation on closed is not an error */
+		return BLK_STS_OK;
+	case BLK_ZONE_COND_IMP_OPEN:
+		dev->nr_zones_imp_open--;
+		break;
+	case BLK_ZONE_COND_EXP_OPEN:
+		dev->nr_zones_exp_open--;
+		break;
+	case BLK_ZONE_COND_EMPTY:
+	case BLK_ZONE_COND_FULL:
+	default:
+		return BLK_STS_IOERR;
+	}
+
+	if (zone->wp == zone->start) {
+		zone->cond = BLK_ZONE_COND_EMPTY;
+	} else {
+		zone->cond = BLK_ZONE_COND_CLOSED;
+		dev->nr_zones_closed++;
+	}
+
+	return BLK_STS_OK;
+}
+
+static void null_close_first_imp_zone(struct nullb_device *dev)
+{
+	unsigned int i;
+
+	for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) {
+		if (dev->zones[i].cond == BLK_ZONE_COND_IMP_OPEN) {
+			null_close_zone(dev, &dev->zones[i]);
+			return;
+		}
+	}
+}
+
+static bool null_can_set_active(struct nullb_device *dev)
+{
+	if (!dev->zone_max_active)
+		return true;
+
+	return dev->nr_zones_exp_open + dev->nr_zones_imp_open +
+	       dev->nr_zones_closed < dev->zone_max_active;
+}
+
+static bool null_can_open(struct nullb_device *dev)
+{
+	if (!dev->zone_max_open)
+		return true;
+
+	if (dev->nr_zones_exp_open + dev->nr_zones_imp_open < dev->zone_max_open)
+		return true;
+
+	if (dev->nr_zones_imp_open && null_can_set_active(dev)) {
+		null_close_first_imp_zone(dev);
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * This function matches the manage open zone resources function in the ZBC standard,
+ * with the addition of max active zones support (added in the ZNS standard).
+ *
+ * The function determines if a zone can transition to implicit open or explicit open,
+ * while maintaining the max open zone (and max active zone) limit(s). It may close an
+ * implicit open zone in order to make additional zone resources available.
+ *
+ * ZBC states that an implicit open zone shall be closed only if there is not
+ * room within the open limit. However, with the addition of an active limit,
+ * it is not certain that closing an implicit open zone will allow a new zone
+ * to be opened, since we might already be at the active limit capacity.
+ */
+static bool null_has_zone_resources(struct nullb_device *dev, struct blk_zone *zone)
+{
+	switch (zone->cond) {
+	case BLK_ZONE_COND_EMPTY:
+		if (!null_can_set_active(dev))
+			return false;
+		fallthrough;
+	case BLK_ZONE_COND_CLOSED:
+		return null_can_open(dev);
+	default:
+		/* Should never be called for other states */
+		WARN_ON(1);
+		return false;
+	}
+}
+
 static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
 				    unsigned int nr_sectors, bool append)
 {
@@ -177,43 +292,155 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
 		/* Cannot write to a full zone */
 		return BLK_STS_IOERR;
 	case BLK_ZONE_COND_EMPTY:
+	case BLK_ZONE_COND_CLOSED:
+		if (!null_has_zone_resources(dev, zone))
+			return BLK_STS_IOERR;
+		break;
 	case BLK_ZONE_COND_IMP_OPEN:
 	case BLK_ZONE_COND_EXP_OPEN:
+		break;
+	default:
+		/* Invalid zone condition */
+		return BLK_STS_IOERR;
+	}
+
+	/*
+	 * Regular writes must be at the write pointer position.
+	 * Zone append writes are automatically issued at the write
+	 * pointer and the position returned using the request or BIO
+	 * sector.
+	 */
+	if (append) {
+		sector = zone->wp;
+		if (cmd->bio)
+			cmd->bio->bi_iter.bi_sector = sector;
+		else
+			cmd->rq->__sector = sector;
+	} else if (sector != zone->wp) {
+		return BLK_STS_IOERR;
+	}
+
+	if (zone->wp + nr_sectors > zone->start + zone->capacity)
+		return BLK_STS_IOERR;
+
+	if (zone->cond == BLK_ZONE_COND_CLOSED) {
+		dev->nr_zones_closed--;
+		dev->nr_zones_imp_open++;
+	} else if (zone->cond == BLK_ZONE_COND_EMPTY) {
+		dev->nr_zones_imp_open++;
+	}
+	if (zone->cond != BLK_ZONE_COND_EXP_OPEN)
+		zone->cond = BLK_ZONE_COND_IMP_OPEN;
+
+	ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors);
+	if (ret != BLK_STS_OK)
+		return ret;
+
+	zone->wp += nr_sectors;
+	if (zone->wp == zone->start + zone->capacity) {
+		if (zone->cond == BLK_ZONE_COND_EXP_OPEN)
+			dev->nr_zones_exp_open--;
+		else if (zone->cond == BLK_ZONE_COND_IMP_OPEN)
+			dev->nr_zones_imp_open--;
+		zone->cond = BLK_ZONE_COND_FULL;
+	}
+	return BLK_STS_OK;
+}
+
+static blk_status_t null_open_zone(struct nullb_device *dev, struct blk_zone *zone)
+{
+	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
+		return BLK_STS_IOERR;
+
+	switch (zone->cond) {
+	case BLK_ZONE_COND_EXP_OPEN:
+		/* open operation on exp open is not an error */
+		return BLK_STS_OK;
+	case BLK_ZONE_COND_EMPTY:
+		if (!null_has_zone_resources(dev, zone))
+			return BLK_STS_IOERR;
+		break;
+	case BLK_ZONE_COND_IMP_OPEN:
+		dev->nr_zones_imp_open--;
+		break;
 	case BLK_ZONE_COND_CLOSED:
-		/*
-		 * Regular writes must be at the write pointer position.
-		 * Zone append writes are automatically issued at the write
-		 * pointer and the position returned using the request or BIO
-		 * sector.
-		 */
-		if (append) {
-			sector = zone->wp;
-			if (cmd->bio)
-				cmd->bio->bi_iter.bi_sector = sector;
-			else
-				cmd->rq->__sector = sector;
-		} else if (sector != zone->wp) {
+		if (!null_has_zone_resources(dev, zone))
 			return BLK_STS_IOERR;
-		}
+		dev->nr_zones_closed--;
+		break;
+	case BLK_ZONE_COND_FULL:
+	default:
+		return BLK_STS_IOERR;
+	}
+
+	zone->cond = BLK_ZONE_COND_EXP_OPEN;
+	dev->nr_zones_exp_open++;
 
-		if (zone->wp + nr_sectors > zone->start + zone->capacity)
+	return BLK_STS_OK;
+}
+
+static blk_status_t null_finish_zone(struct nullb_device *dev, struct blk_zone *zone)
+{
+	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
+		return BLK_STS_IOERR;
+
+	switch (zone->cond) {
+	case BLK_ZONE_COND_FULL:
+		/* finish operation on full is not an error */
+		return BLK_STS_OK;
+	case BLK_ZONE_COND_EMPTY:
+		if (!null_has_zone_resources(dev, zone))
 			return BLK_STS_IOERR;
+		break;
+	case BLK_ZONE_COND_IMP_OPEN:
+		dev->nr_zones_imp_open--;
+		break;
+	case BLK_ZONE_COND_EXP_OPEN:
+		dev->nr_zones_exp_open--;
+		break;
+	case BLK_ZONE_COND_CLOSED:
+		if (!null_has_zone_resources(dev, zone))
+			return BLK_STS_IOERR;
+		dev->nr_zones_closed--;
+		break;
+	default:
+		return BLK_STS_IOERR;
+	}
 
-		if (zone->cond != BLK_ZONE_COND_EXP_OPEN)
-			zone->cond = BLK_ZONE_COND_IMP_OPEN;
+	zone->cond = BLK_ZONE_COND_FULL;
+	zone->wp = zone->start + zone->len;
 
-		ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors);
-		if (ret != BLK_STS_OK)
-			return ret;
+	return BLK_STS_OK;
+}
+
+static blk_status_t null_reset_zone(struct nullb_device *dev, struct blk_zone *zone)
+{
+	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
+		return BLK_STS_IOERR;
 
-		zone->wp += nr_sectors;
-		if (zone->wp == zone->start + zone->capacity)
-			zone->cond = BLK_ZONE_COND_FULL;
+	switch (zone->cond) {
+	case BLK_ZONE_COND_EMPTY:
+		/* reset operation on empty is not an error */
 		return BLK_STS_OK;
+	case BLK_ZONE_COND_IMP_OPEN:
+		dev->nr_zones_imp_open--;
+		break;
+	case BLK_ZONE_COND_EXP_OPEN:
+		dev->nr_zones_exp_open--;
+		break;
+	case BLK_ZONE_COND_CLOSED:
+		dev->nr_zones_closed--;
+		break;
+	case BLK_ZONE_COND_FULL:
+		break;
 	default:
-		/* Invalid zone condition */
 		return BLK_STS_IOERR;
 	}
+
+	zone->cond = BLK_ZONE_COND_EMPTY;
+	zone->wp = zone->start;
+
+	return BLK_STS_OK;
 }
 
 static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op,
@@ -222,56 +449,34 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op,
 	struct nullb_device *dev = cmd->nq->dev;
 	unsigned int zone_no = null_zone_no(dev, sector);
 	struct blk_zone *zone = &dev->zones[zone_no];
+	blk_status_t ret = BLK_STS_OK;
 	size_t i;
 
 	switch (op) {
 	case REQ_OP_ZONE_RESET_ALL:
-		for (i = 0; i < dev->nr_zones; i++) {
-			if (zone[i].type == BLK_ZONE_TYPE_CONVENTIONAL)
-				continue;
-			zone[i].cond = BLK_ZONE_COND_EMPTY;
-			zone[i].wp = zone[i].start;
-		}
+		for (i = dev->zone_nr_conv; i < dev->nr_zones; i++)
+			null_reset_zone(dev, &dev->zones[i]);
 		break;
 	case REQ_OP_ZONE_RESET:
-		if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
-			return BLK_STS_IOERR;
-
-		zone->cond = BLK_ZONE_COND_EMPTY;
-		zone->wp = zone->start;
+		ret = null_reset_zone(dev, zone);
 		break;
 	case REQ_OP_ZONE_OPEN:
-		if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
-			return BLK_STS_IOERR;
-		if (zone->cond == BLK_ZONE_COND_FULL)
-			return BLK_STS_IOERR;
-
-		zone->cond = BLK_ZONE_COND_EXP_OPEN;
+		ret = null_open_zone(dev, zone);
 		break;
 	case REQ_OP_ZONE_CLOSE:
-		if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
-			return BLK_STS_IOERR;
-		if (zone->cond == BLK_ZONE_COND_FULL)
-			return BLK_STS_IOERR;
-
-		if (zone->wp == zone->start)
-			zone->cond = BLK_ZONE_COND_EMPTY;
-		else
-			zone->cond = BLK_ZONE_COND_CLOSED;
+		ret = null_close_zone(dev, zone);
 		break;
 	case REQ_OP_ZONE_FINISH:
-		if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
-			return BLK_STS_IOERR;
-
-		zone->cond = BLK_ZONE_COND_FULL;
-		zone->wp = zone->start + zone->len;
+		ret = null_finish_zone(dev, zone);
 		break;
 	default:
 		return BLK_STS_NOTSUPP;
 	}
 
-	trace_nullb_zone_op(cmd, zone_no, zone->cond);
-	return BLK_STS_OK;
+	if (ret == BLK_STS_OK)
+		trace_nullb_zone_op(cmd, zone_no, zone->cond);
+
+	return ret;
 }
 
 blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_opf op,
-- 
cgit v1.2.3


From a58e88bfdc4d527f99827b8d66ecf68d05c3ac87 Mon Sep 17 00:00:00 2001
From: Coly Li <colyli@suse.de>
Date: Thu, 1 Oct 2020 14:50:42 +0800
Subject: bcache: share register sysfs with async register

Previously the experimental async registration uses a separate sysfs
file register_async. Now the async registration code seems working well
for a while, we can do furtuher testing with it now.

This patch changes the async bcache registration shares the same sysfs
file /sys/fs/bcache/register (and register_quiet). Async registration
will be default behavior if BCACHE_ASYNC_REGISTRATION is set in kernel
configure. By default, BCACHE_ASYNC_REGISTRATION is not configured yet.

Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/super.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 6bfa77167362..d34495d27450 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -2448,7 +2448,6 @@ static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
 
 kobj_attribute_write(register,		register_bcache);
 kobj_attribute_write(register_quiet,	register_bcache);
-kobj_attribute_write(register_async,	register_bcache);
 kobj_attribute_write(pendings_cleanup,	bch_pending_bdevs_cleanup);
 
 static bool bch_is_open_backing(struct block_device *bdev)
@@ -2571,6 +2570,11 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 	struct cache_sb_disk *sb_disk;
 	struct block_device *bdev;
 	ssize_t ret;
+	bool async_registration = false;
+
+#ifdef CONFIG_BCACHE_ASYNC_REGISTRATION
+	async_registration = true;
+#endif
 
 	ret = -EBUSY;
 	err = "failed to reference bcache module";
@@ -2624,7 +2628,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 		goto out_blkdev_put;
 
 	err = "failed to register device";
-	if (attr == &ksysfs_register_async) {
+
+	if (async_registration) {
 		/* register in asynchronous way */
 		struct async_reg_args *args =
 			kzalloc(sizeof(struct async_reg_args), GFP_KERNEL);
@@ -2887,9 +2892,6 @@ static int __init bcache_init(void)
 	static const struct attribute *files[] = {
 		&ksysfs_register.attr,
 		&ksysfs_register_quiet.attr,
-#ifdef CONFIG_BCACHE_ASYNC_REGISTRATION
-		&ksysfs_register_async.attr,
-#endif
 		&ksysfs_pendings_cleanup.attr,
 		NULL
 	};
-- 
cgit v1.2.3


From 7e59c506c338d1e42ed19f24f1a737aff8b18fc3 Mon Sep 17 00:00:00 2001
From: Dongsheng Yang <dongsheng.yang@easystack.cn>
Date: Thu, 1 Oct 2020 14:50:43 +0800
Subject: bcache: check c->root with IS_ERR_OR_NULL() in mca_reserve()

In mca_reserve(c) macro, we are checking root whether is NULL or not.
But that's not enough, when we read the root node in run_cache_set(),
if we got an error in bch_btree_node_read_done(), we will return
ERR_PTR(-EIO) to c->root.

And then we will go continue to unregister, but before calling
unregister_shrinker(&c->shrink), there is a possibility to call
bch_mca_count(), and we would get a crash with call trace like that:

[ 2149.876008] Unable to handle kernel NULL pointer dereference at virtual address 00000000000000b5
... ...
[ 2150.598931] Call trace:
[ 2150.606439]  bch_mca_count+0x58/0x98 [escache]
[ 2150.615866]  do_shrink_slab+0x54/0x310
[ 2150.624429]  shrink_slab+0x248/0x2d0
[ 2150.632633]  drop_slab_node+0x54/0x88
[ 2150.640746]  drop_slab+0x50/0x88
[ 2150.648228]  drop_caches_sysctl_handler+0xf0/0x118
[ 2150.657219]  proc_sys_call_handler.isra.18+0xb8/0x110
[ 2150.666342]  proc_sys_write+0x40/0x50
[ 2150.673889]  __vfs_write+0x48/0x90
[ 2150.681095]  vfs_write+0xac/0x1b8
[ 2150.688145]  ksys_write+0x6c/0xd0
[ 2150.695127]  __arm64_sys_write+0x24/0x30
[ 2150.702749]  el0_svc_handler+0xa0/0x128
[ 2150.710296]  el0_svc+0x8/0xc

Signed-off-by: Dongsheng Yang <dongsheng.yang@easystack.cn>
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/btree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 3d8bd0692af3..ae7611fa42bf 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -514,7 +514,7 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
  * mca -> memory cache
  */
 
-#define mca_reserve(c)	(((c->root && c->root->level)		\
+#define mca_reserve(c)	(((!IS_ERR_OR_NULL(c->root) && c->root->level) \
 			  ? c->root->level : 1) * 8 + 16)
 #define mca_can_free(c)						\
 	max_t(int, 0, c->btree_cache_used - mca_reserve(c))
-- 
cgit v1.2.3


From 84e5d1363cd8128820434cdb983ecab0f5c1b078 Mon Sep 17 00:00:00 2001
From: Qinglang Miao <miaoqinglang@huawei.com>
Date: Thu, 1 Oct 2020 14:50:44 +0800
Subject: bcache: Convert to DEFINE_SHOW_ATTRIBUTE

Use DEFINE_SHOW_ATTRIBUTE macro to simplify the code.

As inode->iprivate equals to third parameter of
debugfs_create_file() which is NULL. So it's equivalent
to original code logic.

Signed-off-by: Qinglang Miao <miaoqinglang@huawei.com>
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/closure.c | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
index 0164a1fe94a9..d8d9394a6beb 100644
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@@ -159,7 +159,7 @@ void closure_debug_destroy(struct closure *cl)
 
 static struct dentry *closure_debug;
 
-static int debug_seq_show(struct seq_file *f, void *data)
+static int debug_show(struct seq_file *f, void *data)
 {
 	struct closure *cl;
 
@@ -188,17 +188,7 @@ static int debug_seq_show(struct seq_file *f, void *data)
 	return 0;
 }
 
-static int debug_seq_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, debug_seq_show, NULL);
-}
-
-static const struct file_operations debug_ops = {
-	.owner		= THIS_MODULE,
-	.open		= debug_seq_open,
-	.read		= seq_read,
-	.release	= single_release
-};
+DEFINE_SHOW_ATTRIBUTE(debug);
 
 void  __init closure_debug_init(void)
 {
@@ -209,7 +199,7 @@ void  __init closure_debug_init(void)
 		 * about this.
 		 */
 		closure_debug = debugfs_create_file(
-			"closures", 0400, bcache_debug, NULL, &debug_ops);
+			"closures", 0400, bcache_debug, NULL, &debug_fops);
 }
 #endif
 
-- 
cgit v1.2.3


From 17e4aed8309ff28670271546c2c3263eb12f5eb6 Mon Sep 17 00:00:00 2001
From: Coly Li <colyli@suse.de>
Date: Thu, 1 Oct 2020 14:50:45 +0800
Subject: bcache: remove 'int n' from parameter list of bch_bucket_alloc_set()

The parameter 'int n' from bch_bucket_alloc_set() is not cleared
defined. From the code comments n is the number of buckets to alloc, but
from the code itself 'n' is the maximum cache to iterate. Indeed all the
locations where bch_bucket_alloc_set() is called, 'n' is alwasy 1.

This patch removes the confused and unnecessary 'int n' from parameter
list of  bch_bucket_alloc_set(), and explicitly allocates only 1 bucket
for its caller.

Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/alloc.c  | 35 +++++++++++++++--------------------
 drivers/md/bcache/bcache.h |  4 ++--
 drivers/md/bcache/btree.c  |  2 +-
 drivers/md/bcache/super.c  |  2 +-
 4 files changed, 19 insertions(+), 24 deletions(-)

diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 52035a78d836..4493ff57476d 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -49,7 +49,7 @@
  *
  * bch_bucket_alloc() allocates a single bucket from a specific cache.
  *
- * bch_bucket_alloc_set() allocates one or more buckets from different caches
+ * bch_bucket_alloc_set() allocates one  bucket from different caches
  * out of a cache set.
  *
  * free_some_buckets() drives all the processes described above. It's called
@@ -488,34 +488,29 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k)
 }
 
 int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
-			   struct bkey *k, int n, bool wait)
+			   struct bkey *k, bool wait)
 {
-	int i;
+	struct cache *ca;
+	long b;
 
 	/* No allocation if CACHE_SET_IO_DISABLE bit is set */
 	if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags)))
 		return -1;
 
 	lockdep_assert_held(&c->bucket_lock);
-	BUG_ON(!n || n > c->caches_loaded || n > MAX_CACHES_PER_SET);
 
 	bkey_init(k);
 
-	/* sort by free space/prio of oldest data in caches */
-
-	for (i = 0; i < n; i++) {
-		struct cache *ca = c->cache_by_alloc[i];
-		long b = bch_bucket_alloc(ca, reserve, wait);
+	ca = c->cache_by_alloc[0];
+	b = bch_bucket_alloc(ca, reserve, wait);
+	if (b == -1)
+		goto err;
 
-		if (b == -1)
-			goto err;
+	k->ptr[0] = MAKE_PTR(ca->buckets[b].gen,
+			     bucket_to_sector(c, b),
+			     ca->sb.nr_this_dev);
 
-		k->ptr[i] = MAKE_PTR(ca->buckets[b].gen,
-				bucket_to_sector(c, b),
-				ca->sb.nr_this_dev);
-
-		SET_KEY_PTRS(k, i + 1);
-	}
+	SET_KEY_PTRS(k, 1);
 
 	return 0;
 err:
@@ -525,12 +520,12 @@ err:
 }
 
 int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
-			 struct bkey *k, int n, bool wait)
+			 struct bkey *k, bool wait)
 {
 	int ret;
 
 	mutex_lock(&c->bucket_lock);
-	ret = __bch_bucket_alloc_set(c, reserve, k, n, wait);
+	ret = __bch_bucket_alloc_set(c, reserve, k, wait);
 	mutex_unlock(&c->bucket_lock);
 	return ret;
 }
@@ -638,7 +633,7 @@ bool bch_alloc_sectors(struct cache_set *c,
 
 		spin_unlock(&c->data_bucket_lock);
 
-		if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, wait))
+		if (bch_bucket_alloc_set(c, watermark, &alloc.key, wait))
 			return false;
 
 		spin_lock(&c->data_bucket_lock);
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 4fd03d2496d8..5ff6e9573935 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -994,9 +994,9 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k);
 
 long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait);
 int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
-			   struct bkey *k, int n, bool wait);
+			   struct bkey *k, bool wait);
 int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
-			 struct bkey *k, int n, bool wait);
+			 struct bkey *k, bool wait);
 bool bch_alloc_sectors(struct cache_set *c, struct bkey *k,
 		       unsigned int sectors, unsigned int write_point,
 		       unsigned int write_prio, bool wait);
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index ae7611fa42bf..e2bd03408e66 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1091,7 +1091,7 @@ struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
 
 	mutex_lock(&c->bucket_lock);
 retry:
-	if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait))
+	if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, wait))
 		goto err;
 
 	bkey_put(c, &k.key);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index d34495d27450..b4176b8e9571 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -486,7 +486,7 @@ static int __uuid_write(struct cache_set *c)
 	closure_init_stack(&cl);
 	lockdep_assert_held(&bch_register_lock);
 
-	if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true))
+	if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, true))
 		return 1;
 
 	size =  meta_bucket_pages(&c->sb) * PAGE_SECTORS;
-- 
cgit v1.2.3


From 697e23495c94f0380c1ed8b11f830b92b64c99ea Mon Sep 17 00:00:00 2001
From: Coly Li <colyli@suse.de>
Date: Thu, 1 Oct 2020 14:50:46 +0800
Subject: bcache: explicitly make cache_set only have single cache

Currently although the bcache code has a framework for multiple caches
in a cache set, but indeed the multiple caches never completed and users
use md raid1 for multiple copies of the cached data.

This patch does the following change in struct cache_set, to explicitly
make a cache_set only have single cache,
- Change pointer array "*cache[MAX_CACHES_PER_SET]" to a single pointer
  "*cache".
- Remove pointer array "*cache_by_alloc[MAX_CACHES_PER_SET]".
- Remove "caches_loaded".

Now the code looks as exactly what it does in practic: only one cache is
used in the cache set.

Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/alloc.c  |  2 +-
 drivers/md/bcache/bcache.h |  8 +++-----
 drivers/md/bcache/super.c  | 19 ++++++++-----------
 3 files changed, 12 insertions(+), 17 deletions(-)

diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 4493ff57476d..3385f6add6df 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -501,7 +501,7 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
 
 	bkey_init(k);
 
-	ca = c->cache_by_alloc[0];
+	ca = c->cache;
 	b = bch_bucket_alloc(ca, reserve, wait);
 	if (b == -1)
 		goto err;
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 5ff6e9573935..aa112c1adba1 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -519,9 +519,7 @@ struct cache_set {
 
 	struct cache_sb		sb;
 
-	struct cache		*cache[MAX_CACHES_PER_SET];
-	struct cache		*cache_by_alloc[MAX_CACHES_PER_SET];
-	int			caches_loaded;
+	struct cache		*cache;
 
 	struct bcache_device	**devices;
 	unsigned int		devices_max_used;
@@ -808,7 +806,7 @@ static inline struct cache *PTR_CACHE(struct cache_set *c,
 				      const struct bkey *k,
 				      unsigned int ptr)
 {
-	return c->cache[PTR_DEV(k, ptr)];
+	return c->cache;
 }
 
 static inline size_t PTR_BUCKET_NR(struct cache_set *c,
@@ -890,7 +888,7 @@ do {									\
 /* Looping macros */
 
 #define for_each_cache(ca, cs, iter)					\
-	for (iter = 0; ca = cs->cache[iter], iter < (cs)->sb.nr_in_set; iter++)
+	for (iter = 0; ca = cs->cache, iter < 1; iter++)
 
 #define for_each_bucket(b, ca)						\
 	for (b = (ca)->buckets + (ca)->sb.first_bucket;			\
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index b4176b8e9571..739cf292554c 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1674,7 +1674,7 @@ static void cache_set_free(struct closure *cl)
 	for_each_cache(ca, c, i)
 		if (ca) {
 			ca->set = NULL;
-			c->cache[ca->sb.nr_this_dev] = NULL;
+			c->cache = NULL;
 			kobject_put(&ca->kobj);
 		}
 
@@ -2165,7 +2165,7 @@ static const char *register_cache_set(struct cache *ca)
 
 	list_for_each_entry(c, &bch_cache_sets, list)
 		if (!memcmp(c->sb.set_uuid, ca->sb.set_uuid, 16)) {
-			if (c->cache[ca->sb.nr_this_dev])
+			if (c->cache)
 				return "duplicate cache set member";
 
 			if (!can_attach_cache(ca, c))
@@ -2215,14 +2215,11 @@ found:
 
 	kobject_get(&ca->kobj);
 	ca->set = c;
-	ca->set->cache[ca->sb.nr_this_dev] = ca;
-	c->cache_by_alloc[c->caches_loaded++] = ca;
+	ca->set->cache = ca;
 
-	if (c->caches_loaded == c->sb.nr_in_set) {
-		err = "failed to run cache set";
-		if (run_cache_set(c) < 0)
-			goto err;
-	}
+	err = "failed to run cache set";
+	if (run_cache_set(c) < 0)
+		goto err;
 
 	return NULL;
 err:
@@ -2239,8 +2236,8 @@ void bch_cache_release(struct kobject *kobj)
 	unsigned int i;
 
 	if (ca->set) {
-		BUG_ON(ca->set->cache[ca->sb.nr_this_dev] != ca);
-		ca->set->cache[ca->sb.nr_this_dev] = NULL;
+		BUG_ON(ca->set->cache != ca);
+		ca->set->cache = NULL;
 	}
 
 	free_pages((unsigned long) ca->disk_buckets, ilog2(meta_bucket_pages(&ca->sb)));
-- 
cgit v1.2.3


From 08fdb2cddbdc783fecadfb606868c4498165fc30 Mon Sep 17 00:00:00 2001
From: Coly Li <colyli@suse.de>
Date: Thu, 1 Oct 2020 14:50:47 +0800
Subject: bcache: remove for_each_cache()

Since now each cache_set explicitly has single cache, for_each_cache()
is unnecessary. This patch removes this macro, and update all locations
where it is used, and makes sure all code logic still being consistent.

Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/alloc.c    |  17 ++--
 drivers/md/bcache/bcache.h   |   9 +-
 drivers/md/bcache/btree.c    | 103 +++++++++----------
 drivers/md/bcache/journal.c  | 229 ++++++++++++++++++++-----------------------
 drivers/md/bcache/movinggc.c |  58 ++++++-----
 drivers/md/bcache/super.c    | 115 +++++++++-------------
 6 files changed, 237 insertions(+), 294 deletions(-)

diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 3385f6add6df..1b8310992dd0 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -88,7 +88,6 @@ void bch_rescale_priorities(struct cache_set *c, int sectors)
 	struct cache *ca;
 	struct bucket *b;
 	unsigned long next = c->nbuckets * c->sb.bucket_size / 1024;
-	unsigned int i;
 	int r;
 
 	atomic_sub(sectors, &c->rescale);
@@ -104,14 +103,14 @@ void bch_rescale_priorities(struct cache_set *c, int sectors)
 
 	c->min_prio = USHRT_MAX;
 
-	for_each_cache(ca, c, i)
-		for_each_bucket(b, ca)
-			if (b->prio &&
-			    b->prio != BTREE_PRIO &&
-			    !atomic_read(&b->pin)) {
-				b->prio--;
-				c->min_prio = min(c->min_prio, b->prio);
-			}
+	ca = c->cache;
+	for_each_bucket(b, ca)
+		if (b->prio &&
+		    b->prio != BTREE_PRIO &&
+		    !atomic_read(&b->pin)) {
+			b->prio--;
+			c->min_prio = min(c->min_prio, b->prio);
+		}
 
 	mutex_unlock(&c->bucket_lock);
 }
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index aa112c1adba1..7ffe6b2d179b 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -887,9 +887,6 @@ do {									\
 
 /* Looping macros */
 
-#define for_each_cache(ca, cs, iter)					\
-	for (iter = 0; ca = cs->cache, iter < 1; iter++)
-
 #define for_each_bucket(b, ca)						\
 	for (b = (ca)->buckets + (ca)->sb.first_bucket;			\
 	     b < (ca)->buckets + (ca)->sb.nbuckets; b++)
@@ -931,11 +928,9 @@ static inline uint8_t bucket_gc_gen(struct bucket *b)
 
 static inline void wake_up_allocators(struct cache_set *c)
 {
-	struct cache *ca;
-	unsigned int i;
+	struct cache *ca = c->cache;
 
-	for_each_cache(ca, c, i)
-		wake_up_process(ca->alloc_thread);
+	wake_up_process(ca->alloc_thread);
 }
 
 static inline void closure_bio_submit(struct cache_set *c,
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index e2bd03408e66..f626d536981e 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1167,19 +1167,18 @@ static void make_btree_freeing_key(struct btree *b, struct bkey *k)
 static int btree_check_reserve(struct btree *b, struct btree_op *op)
 {
 	struct cache_set *c = b->c;
-	struct cache *ca;
-	unsigned int i, reserve = (c->root->level - b->level) * 2 + 1;
+	struct cache *ca = c->cache;
+	unsigned int reserve = (c->root->level - b->level) * 2 + 1;
 
 	mutex_lock(&c->bucket_lock);
 
-	for_each_cache(ca, c, i)
-		if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
-			if (op)
-				prepare_to_wait(&c->btree_cache_wait, &op->wait,
-						TASK_UNINTERRUPTIBLE);
-			mutex_unlock(&c->bucket_lock);
-			return -EINTR;
-		}
+	if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
+		if (op)
+			prepare_to_wait(&c->btree_cache_wait, &op->wait,
+					TASK_UNINTERRUPTIBLE);
+		mutex_unlock(&c->bucket_lock);
+		return -EINTR;
+	}
 
 	mutex_unlock(&c->bucket_lock);
 
@@ -1695,7 +1694,6 @@ static void btree_gc_start(struct cache_set *c)
 {
 	struct cache *ca;
 	struct bucket *b;
-	unsigned int i;
 
 	if (!c->gc_mark_valid)
 		return;
@@ -1705,14 +1703,14 @@ static void btree_gc_start(struct cache_set *c)
 	c->gc_mark_valid = 0;
 	c->gc_done = ZERO_KEY;
 
-	for_each_cache(ca, c, i)
-		for_each_bucket(b, ca) {
-			b->last_gc = b->gen;
-			if (!atomic_read(&b->pin)) {
-				SET_GC_MARK(b, 0);
-				SET_GC_SECTORS_USED(b, 0);
-			}
+	ca = c->cache;
+	for_each_bucket(b, ca) {
+		b->last_gc = b->gen;
+		if (!atomic_read(&b->pin)) {
+			SET_GC_MARK(b, 0);
+			SET_GC_SECTORS_USED(b, 0);
 		}
+	}
 
 	mutex_unlock(&c->bucket_lock);
 }
@@ -1721,7 +1719,8 @@ static void bch_btree_gc_finish(struct cache_set *c)
 {
 	struct bucket *b;
 	struct cache *ca;
-	unsigned int i;
+	unsigned int i, j;
+	uint64_t *k;
 
 	mutex_lock(&c->bucket_lock);
 
@@ -1739,7 +1738,6 @@ static void bch_btree_gc_finish(struct cache_set *c)
 		struct bcache_device *d = c->devices[i];
 		struct cached_dev *dc;
 		struct keybuf_key *w, *n;
-		unsigned int j;
 
 		if (!d || UUID_FLASH_ONLY(&c->uuids[i]))
 			continue;
@@ -1756,29 +1754,27 @@ static void bch_btree_gc_finish(struct cache_set *c)
 	rcu_read_unlock();
 
 	c->avail_nbuckets = 0;
-	for_each_cache(ca, c, i) {
-		uint64_t *i;
 
-		ca->invalidate_needs_gc = 0;
+	ca = c->cache;
+	ca->invalidate_needs_gc = 0;
 
-		for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++)
-			SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
+	for (k = ca->sb.d; k < ca->sb.d + ca->sb.keys; k++)
+		SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA);
 
-		for (i = ca->prio_buckets;
-		     i < ca->prio_buckets + prio_buckets(ca) * 2; i++)
-			SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
+	for (k = ca->prio_buckets;
+	     k < ca->prio_buckets + prio_buckets(ca) * 2; k++)
+		SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA);
 
-		for_each_bucket(b, ca) {
-			c->need_gc	= max(c->need_gc, bucket_gc_gen(b));
+	for_each_bucket(b, ca) {
+		c->need_gc	= max(c->need_gc, bucket_gc_gen(b));
 
-			if (atomic_read(&b->pin))
-				continue;
+		if (atomic_read(&b->pin))
+			continue;
 
-			BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b));
+		BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b));
 
-			if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)
-				c->avail_nbuckets++;
-		}
+		if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)
+			c->avail_nbuckets++;
 	}
 
 	mutex_unlock(&c->bucket_lock);
@@ -1830,12 +1826,10 @@ static void bch_btree_gc(struct cache_set *c)
 
 static bool gc_should_run(struct cache_set *c)
 {
-	struct cache *ca;
-	unsigned int i;
+	struct cache *ca = c->cache;
 
-	for_each_cache(ca, c, i)
-		if (ca->invalidate_needs_gc)
-			return true;
+	if (ca->invalidate_needs_gc)
+		return true;
 
 	if (atomic_read(&c->sectors_to_gc) < 0)
 		return true;
@@ -2081,9 +2075,8 @@ out:
 
 void bch_initial_gc_finish(struct cache_set *c)
 {
-	struct cache *ca;
+	struct cache *ca = c->cache;
 	struct bucket *b;
-	unsigned int i;
 
 	bch_btree_gc_finish(c);
 
@@ -2098,20 +2091,18 @@ void bch_initial_gc_finish(struct cache_set *c)
 	 * This is only safe for buckets that have no live data in them, which
 	 * there should always be some of.
 	 */
-	for_each_cache(ca, c, i) {
-		for_each_bucket(b, ca) {
-			if (fifo_full(&ca->free[RESERVE_PRIO]) &&
-			    fifo_full(&ca->free[RESERVE_BTREE]))
-				break;
+	for_each_bucket(b, ca) {
+		if (fifo_full(&ca->free[RESERVE_PRIO]) &&
+		    fifo_full(&ca->free[RESERVE_BTREE]))
+			break;
 
-			if (bch_can_invalidate_bucket(ca, b) &&
-			    !GC_MARK(b)) {
-				__bch_invalidate_one_bucket(ca, b);
-				if (!fifo_push(&ca->free[RESERVE_PRIO],
-				   b - ca->buckets))
-					fifo_push(&ca->free[RESERVE_BTREE],
-						  b - ca->buckets);
-			}
+		if (bch_can_invalidate_bucket(ca, b) &&
+		    !GC_MARK(b)) {
+			__bch_invalidate_one_bucket(ca, b);
+			if (!fifo_push(&ca->free[RESERVE_PRIO],
+			   b - ca->buckets))
+				fifo_push(&ca->free[RESERVE_BTREE],
+					  b - ca->buckets);
 		}
 	}
 
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index c1227bdb57e7..e89ae7c4ba97 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -179,112 +179,109 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
 		ret;							\
 	})
 
-	struct cache *ca;
-	unsigned int iter;
+	struct cache *ca = c->cache;
 	int ret = 0;
+	struct journal_device *ja = &ca->journal;
+	DECLARE_BITMAP(bitmap, SB_JOURNAL_BUCKETS);
+	unsigned int i, l, r, m;
+	uint64_t seq;
 
-	for_each_cache(ca, c, iter) {
-		struct journal_device *ja = &ca->journal;
-		DECLARE_BITMAP(bitmap, SB_JOURNAL_BUCKETS);
-		unsigned int i, l, r, m;
-		uint64_t seq;
-
-		bitmap_zero(bitmap, SB_JOURNAL_BUCKETS);
-		pr_debug("%u journal buckets\n", ca->sb.njournal_buckets);
+	bitmap_zero(bitmap, SB_JOURNAL_BUCKETS);
+	pr_debug("%u journal buckets\n", ca->sb.njournal_buckets);
 
+	/*
+	 * Read journal buckets ordered by golden ratio hash to quickly
+	 * find a sequence of buckets with valid journal entries
+	 */
+	for (i = 0; i < ca->sb.njournal_buckets; i++) {
 		/*
-		 * Read journal buckets ordered by golden ratio hash to quickly
-		 * find a sequence of buckets with valid journal entries
+		 * We must try the index l with ZERO first for
+		 * correctness due to the scenario that the journal
+		 * bucket is circular buffer which might have wrapped
 		 */
-		for (i = 0; i < ca->sb.njournal_buckets; i++) {
-			/*
-			 * We must try the index l with ZERO first for
-			 * correctness due to the scenario that the journal
-			 * bucket is circular buffer which might have wrapped
-			 */
-			l = (i * 2654435769U) % ca->sb.njournal_buckets;
+		l = (i * 2654435769U) % ca->sb.njournal_buckets;
 
-			if (test_bit(l, bitmap))
-				break;
+		if (test_bit(l, bitmap))
+			break;
 
-			if (read_bucket(l))
-				goto bsearch;
-		}
+		if (read_bucket(l))
+			goto bsearch;
+	}
 
-		/*
-		 * If that fails, check all the buckets we haven't checked
-		 * already
-		 */
-		pr_debug("falling back to linear search\n");
+	/*
+	 * If that fails, check all the buckets we haven't checked
+	 * already
+	 */
+	pr_debug("falling back to linear search\n");
 
-		for_each_clear_bit(l, bitmap, ca->sb.njournal_buckets)
-			if (read_bucket(l))
-				goto bsearch;
+	for_each_clear_bit(l, bitmap, ca->sb.njournal_buckets)
+		if (read_bucket(l))
+			goto bsearch;
 
-		/* no journal entries on this device? */
-		if (l == ca->sb.njournal_buckets)
-			continue;
+	/* no journal entries on this device? */
+	if (l == ca->sb.njournal_buckets)
+		goto out;
 bsearch:
-		BUG_ON(list_empty(list));
+	BUG_ON(list_empty(list));
 
-		/* Binary search */
-		m = l;
-		r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
-		pr_debug("starting binary search, l %u r %u\n", l, r);
+	/* Binary search */
+	m = l;
+	r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
+	pr_debug("starting binary search, l %u r %u\n", l, r);
 
-		while (l + 1 < r) {
-			seq = list_entry(list->prev, struct journal_replay,
-					 list)->j.seq;
+	while (l + 1 < r) {
+		seq = list_entry(list->prev, struct journal_replay,
+				 list)->j.seq;
 
-			m = (l + r) >> 1;
-			read_bucket(m);
+		m = (l + r) >> 1;
+		read_bucket(m);
 
-			if (seq != list_entry(list->prev, struct journal_replay,
-					      list)->j.seq)
-				l = m;
-			else
-				r = m;
-		}
+		if (seq != list_entry(list->prev, struct journal_replay,
+				      list)->j.seq)
+			l = m;
+		else
+			r = m;
+	}
 
-		/*
-		 * Read buckets in reverse order until we stop finding more
-		 * journal entries
-		 */
-		pr_debug("finishing up: m %u njournal_buckets %u\n",
-			 m, ca->sb.njournal_buckets);
-		l = m;
+	/*
+	 * Read buckets in reverse order until we stop finding more
+	 * journal entries
+	 */
+	pr_debug("finishing up: m %u njournal_buckets %u\n",
+		 m, ca->sb.njournal_buckets);
+	l = m;
 
-		while (1) {
-			if (!l--)
-				l = ca->sb.njournal_buckets - 1;
+	while (1) {
+		if (!l--)
+			l = ca->sb.njournal_buckets - 1;
 
-			if (l == m)
-				break;
+		if (l == m)
+			break;
 
-			if (test_bit(l, bitmap))
-				continue;
+		if (test_bit(l, bitmap))
+			continue;
 
-			if (!read_bucket(l))
-				break;
-		}
+		if (!read_bucket(l))
+			break;
+	}
 
-		seq = 0;
+	seq = 0;
 
-		for (i = 0; i < ca->sb.njournal_buckets; i++)
-			if (ja->seq[i] > seq) {
-				seq = ja->seq[i];
-				/*
-				 * When journal_reclaim() goes to allocate for
-				 * the first time, it'll use the bucket after
-				 * ja->cur_idx
-				 */
-				ja->cur_idx = i;
-				ja->last_idx = ja->discard_idx = (i + 1) %
-					ca->sb.njournal_buckets;
+	for (i = 0; i < ca->sb.njournal_buckets; i++)
+		if (ja->seq[i] > seq) {
+			seq = ja->seq[i];
+			/*
+			 * When journal_reclaim() goes to allocate for
+			 * the first time, it'll use the bucket after
+			 * ja->cur_idx
+			 */
+			ja->cur_idx = i;
+			ja->last_idx = ja->discard_idx = (i + 1) %
+				ca->sb.njournal_buckets;
 
-			}
-	}
+		}
 
+out:
 	if (!list_empty(list))
 		c->journal.seq = list_entry(list->prev,
 					    struct journal_replay,
@@ -342,12 +339,10 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list)
 
 static bool is_discard_enabled(struct cache_set *s)
 {
-	struct cache *ca;
-	unsigned int i;
+	struct cache *ca = s->cache;
 
-	for_each_cache(ca, s, i)
-		if (ca->discard)
-			return true;
+	if (ca->discard)
+		return true;
 
 	return false;
 }
@@ -633,9 +628,10 @@ static void do_journal_discard(struct cache *ca)
 static void journal_reclaim(struct cache_set *c)
 {
 	struct bkey *k = &c->journal.key;
-	struct cache *ca;
+	struct cache *ca = c->cache;
 	uint64_t last_seq;
-	unsigned int iter, n = 0;
+	unsigned int next;
+	struct journal_device *ja = &ca->journal;
 	atomic_t p __maybe_unused;
 
 	atomic_long_inc(&c->reclaim);
@@ -647,46 +643,31 @@ static void journal_reclaim(struct cache_set *c)
 
 	/* Update last_idx */
 
-	for_each_cache(ca, c, iter) {
-		struct journal_device *ja = &ca->journal;
-
-		while (ja->last_idx != ja->cur_idx &&
-		       ja->seq[ja->last_idx] < last_seq)
-			ja->last_idx = (ja->last_idx + 1) %
-				ca->sb.njournal_buckets;
-	}
+	while (ja->last_idx != ja->cur_idx &&
+	       ja->seq[ja->last_idx] < last_seq)
+		ja->last_idx = (ja->last_idx + 1) %
+			ca->sb.njournal_buckets;
 
-	for_each_cache(ca, c, iter)
-		do_journal_discard(ca);
+	do_journal_discard(ca);
 
 	if (c->journal.blocks_free)
 		goto out;
 
-	/*
-	 * Allocate:
-	 * XXX: Sort by free journal space
-	 */
-
-	for_each_cache(ca, c, iter) {
-		struct journal_device *ja = &ca->journal;
-		unsigned int next = (ja->cur_idx + 1) % ca->sb.njournal_buckets;
+	next = (ja->cur_idx + 1) % ca->sb.njournal_buckets;
+	/* No space available on this device */
+	if (next == ja->discard_idx)
+		goto out;
 
-		/* No space available on this device */
-		if (next == ja->discard_idx)
-			continue;
+	ja->cur_idx = next;
+	k->ptr[0] = MAKE_PTR(0,
+			     bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
+			     ca->sb.nr_this_dev);
+	atomic_long_inc(&c->reclaimed_journal_buckets);
 
-		ja->cur_idx = next;
-		k->ptr[n++] = MAKE_PTR(0,
-				  bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
-				  ca->sb.nr_this_dev);
-		atomic_long_inc(&c->reclaimed_journal_buckets);
-	}
+	bkey_init(k);
+	SET_KEY_PTRS(k, 1);
+	c->journal.blocks_free = c->sb.bucket_size >> c->block_bits;
 
-	if (n) {
-		bkey_init(k);
-		SET_KEY_PTRS(k, n);
-		c->journal.blocks_free = c->sb.bucket_size >> c->block_bits;
-	}
 out:
 	if (!journal_full(&c->journal))
 		__closure_wake_up(&c->journal.wait);
@@ -750,7 +731,7 @@ static void journal_write_unlocked(struct closure *cl)
 	__releases(c->journal.lock)
 {
 	struct cache_set *c = container_of(cl, struct cache_set, journal.io);
-	struct cache *ca;
+	struct cache *ca = c->cache;
 	struct journal_write *w = c->journal.cur;
 	struct bkey *k = &c->journal.key;
 	unsigned int i, sectors = set_blocks(w->data, block_bytes(c)) *
@@ -780,9 +761,7 @@ static void journal_write_unlocked(struct closure *cl)
 	bkey_copy(&w->data->btree_root, &c->root->key);
 	bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket);
 
-	for_each_cache(ca, c, i)
-		w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0];
-
+	w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0];
 	w->data->magic		= jset_magic(&c->sb);
 	w->data->version	= BCACHE_JSET_VERSION;
 	w->data->last_seq	= last_seq(&c->journal);
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 5872d6470470..b9c3d27ec093 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -196,50 +196,48 @@ static unsigned int bucket_heap_top(struct cache *ca)
 
 void bch_moving_gc(struct cache_set *c)
 {
-	struct cache *ca;
+	struct cache *ca = c->cache;
 	struct bucket *b;
-	unsigned int i;
+	unsigned long sectors_to_move, reserve_sectors;
 
 	if (!c->copy_gc_enabled)
 		return;
 
 	mutex_lock(&c->bucket_lock);
 
-	for_each_cache(ca, c, i) {
-		unsigned long sectors_to_move = 0;
-		unsigned long reserve_sectors = ca->sb.bucket_size *
+	sectors_to_move = 0;
+	reserve_sectors = ca->sb.bucket_size *
 			     fifo_used(&ca->free[RESERVE_MOVINGGC]);
 
-		ca->heap.used = 0;
-
-		for_each_bucket(b, ca) {
-			if (GC_MARK(b) == GC_MARK_METADATA ||
-			    !GC_SECTORS_USED(b) ||
-			    GC_SECTORS_USED(b) == ca->sb.bucket_size ||
-			    atomic_read(&b->pin))
-				continue;
-
-			if (!heap_full(&ca->heap)) {
-				sectors_to_move += GC_SECTORS_USED(b);
-				heap_add(&ca->heap, b, bucket_cmp);
-			} else if (bucket_cmp(b, heap_peek(&ca->heap))) {
-				sectors_to_move -= bucket_heap_top(ca);
-				sectors_to_move += GC_SECTORS_USED(b);
-
-				ca->heap.data[0] = b;
-				heap_sift(&ca->heap, 0, bucket_cmp);
-			}
-		}
+	ca->heap.used = 0;
+
+	for_each_bucket(b, ca) {
+		if (GC_MARK(b) == GC_MARK_METADATA ||
+		    !GC_SECTORS_USED(b) ||
+		    GC_SECTORS_USED(b) == ca->sb.bucket_size ||
+		    atomic_read(&b->pin))
+			continue;
 
-		while (sectors_to_move > reserve_sectors) {
-			heap_pop(&ca->heap, b, bucket_cmp);
-			sectors_to_move -= GC_SECTORS_USED(b);
+		if (!heap_full(&ca->heap)) {
+			sectors_to_move += GC_SECTORS_USED(b);
+			heap_add(&ca->heap, b, bucket_cmp);
+		} else if (bucket_cmp(b, heap_peek(&ca->heap))) {
+			sectors_to_move -= bucket_heap_top(ca);
+			sectors_to_move += GC_SECTORS_USED(b);
+
+			ca->heap.data[0] = b;
+			heap_sift(&ca->heap, 0, bucket_cmp);
 		}
+	}
 
-		while (heap_pop(&ca->heap, b, bucket_cmp))
-			SET_GC_MOVE(b, 1);
+	while (sectors_to_move > reserve_sectors) {
+		heap_pop(&ca->heap, b, bucket_cmp);
+		sectors_to_move -= GC_SECTORS_USED(b);
 	}
 
+	while (heap_pop(&ca->heap, b, bucket_cmp))
+		SET_GC_MOVE(b, 1);
+
 	mutex_unlock(&c->bucket_lock);
 
 	c->moving_gc_keys.last_scanned = ZERO_KEY;
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 739cf292554c..1eb55a600ed7 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -343,8 +343,9 @@ static void bcache_write_super_unlock(struct closure *cl)
 void bcache_write_super(struct cache_set *c)
 {
 	struct closure *cl = &c->sb_write;
-	struct cache *ca;
-	unsigned int i, version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
+	struct cache *ca = c->cache;
+	struct bio *bio = &ca->sb_bio;
+	unsigned int version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
 
 	down(&c->sb_write_mutex);
 	closure_init(cl, &c->cl);
@@ -354,23 +355,19 @@ void bcache_write_super(struct cache_set *c)
 	if (c->sb.version > version)
 		version = c->sb.version;
 
-	for_each_cache(ca, c, i) {
-		struct bio *bio = &ca->sb_bio;
-
-		ca->sb.version		= version;
-		ca->sb.seq		= c->sb.seq;
-		ca->sb.last_mount	= c->sb.last_mount;
+	ca->sb.version		= version;
+	ca->sb.seq		= c->sb.seq;
+	ca->sb.last_mount	= c->sb.last_mount;
 
-		SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb));
+	SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb));
 
-		bio_init(bio, ca->sb_bv, 1);
-		bio_set_dev(bio, ca->bdev);
-		bio->bi_end_io	= write_super_endio;
-		bio->bi_private = ca;
+	bio_init(bio, ca->sb_bv, 1);
+	bio_set_dev(bio, ca->bdev);
+	bio->bi_end_io	= write_super_endio;
+	bio->bi_private = ca;
 
-		closure_get(cl);
-		__write_super(&ca->sb, ca->sb_disk, bio);
-	}
+	closure_get(cl);
+	__write_super(&ca->sb, ca->sb_disk, bio);
 
 	closure_return_with_destructor(cl, bcache_write_super_unlock);
 }
@@ -772,26 +769,22 @@ static void bcache_device_unlink(struct bcache_device *d)
 	lockdep_assert_held(&bch_register_lock);
 
 	if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) {
-		unsigned int i;
-		struct cache *ca;
+		struct cache *ca = d->c->cache;
 
 		sysfs_remove_link(&d->c->kobj, d->name);
 		sysfs_remove_link(&d->kobj, "cache");
 
-		for_each_cache(ca, d->c, i)
-			bd_unlink_disk_holder(ca->bdev, d->disk);
+		bd_unlink_disk_holder(ca->bdev, d->disk);
 	}
 }
 
 static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
 			       const char *name)
 {
-	unsigned int i;
-	struct cache *ca;
+	struct cache *ca = c->cache;
 	int ret;
 
-	for_each_cache(ca, d->c, i)
-		bd_link_disk_holder(ca->bdev, d->disk);
+	bd_link_disk_holder(ca->bdev, d->disk);
 
 	snprintf(d->name, BCACHEDEVNAME_SIZE,
 		 "%s%u", name, d->id);
@@ -1662,7 +1655,6 @@ static void cache_set_free(struct closure *cl)
 {
 	struct cache_set *c = container_of(cl, struct cache_set, cl);
 	struct cache *ca;
-	unsigned int i;
 
 	debugfs_remove(c->debug);
 
@@ -1671,12 +1663,12 @@ static void cache_set_free(struct closure *cl)
 	bch_journal_free(c);
 
 	mutex_lock(&bch_register_lock);
-	for_each_cache(ca, c, i)
-		if (ca) {
-			ca->set = NULL;
-			c->cache = NULL;
-			kobject_put(&ca->kobj);
-		}
+	ca = c->cache;
+	if (ca) {
+		ca->set = NULL;
+		c->cache = NULL;
+		kobject_put(&ca->kobj);
+	}
 
 	bch_bset_sort_state_free(&c->sort);
 	free_pages((unsigned long) c->uuids, ilog2(meta_bucket_pages(&c->sb)));
@@ -1702,9 +1694,8 @@ static void cache_set_free(struct closure *cl)
 static void cache_set_flush(struct closure *cl)
 {
 	struct cache_set *c = container_of(cl, struct cache_set, caching);
-	struct cache *ca;
+	struct cache *ca = c->cache;
 	struct btree *b;
-	unsigned int i;
 
 	bch_cache_accounting_destroy(&c->accounting);
 
@@ -1729,9 +1720,8 @@ static void cache_set_flush(struct closure *cl)
 			mutex_unlock(&b->write_lock);
 		}
 
-	for_each_cache(ca, c, i)
-		if (ca->alloc_thread)
-			kthread_stop(ca->alloc_thread);
+	if (ca->alloc_thread)
+		kthread_stop(ca->alloc_thread);
 
 	if (c->journal.cur) {
 		cancel_delayed_work_sync(&c->journal.work);
@@ -1972,16 +1962,14 @@ static int run_cache_set(struct cache_set *c)
 {
 	const char *err = "cannot allocate memory";
 	struct cached_dev *dc, *t;
-	struct cache *ca;
+	struct cache *ca = c->cache;
 	struct closure cl;
-	unsigned int i;
 	LIST_HEAD(journal);
 	struct journal_replay *l;
 
 	closure_init_stack(&cl);
 
-	for_each_cache(ca, c, i)
-		c->nbuckets += ca->sb.nbuckets;
+	c->nbuckets = ca->sb.nbuckets;
 	set_gc_sectors(c);
 
 	if (CACHE_SYNC(&c->sb)) {
@@ -2001,10 +1989,8 @@ static int run_cache_set(struct cache_set *c)
 		j = &list_entry(journal.prev, struct journal_replay, list)->j;
 
 		err = "IO error reading priorities";
-		for_each_cache(ca, c, i) {
-			if (prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]))
-				goto err;
-		}
+		if (prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]))
+			goto err;
 
 		/*
 		 * If prio_read() fails it'll call cache_set_error and we'll
@@ -2048,9 +2034,8 @@ static int run_cache_set(struct cache_set *c)
 		bch_journal_next(&c->journal);
 
 		err = "error starting allocator thread";
-		for_each_cache(ca, c, i)
-			if (bch_cache_allocator_start(ca))
-				goto err;
+		if (bch_cache_allocator_start(ca))
+			goto err;
 
 		/*
 		 * First place it's safe to allocate: btree_check() and
@@ -2069,28 +2054,23 @@ static int run_cache_set(struct cache_set *c)
 		if (bch_journal_replay(c, &journal))
 			goto err;
 	} else {
-		pr_notice("invalidating existing data\n");
-
-		for_each_cache(ca, c, i) {
-			unsigned int j;
+		unsigned int j;
 
-			ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
-					      2, SB_JOURNAL_BUCKETS);
+		pr_notice("invalidating existing data\n");
+		ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
+					2, SB_JOURNAL_BUCKETS);
 
-			for (j = 0; j < ca->sb.keys; j++)
-				ca->sb.d[j] = ca->sb.first_bucket + j;
-		}
+		for (j = 0; j < ca->sb.keys; j++)
+			ca->sb.d[j] = ca->sb.first_bucket + j;
 
 		bch_initial_gc_finish(c);
 
 		err = "error starting allocator thread";
-		for_each_cache(ca, c, i)
-			if (bch_cache_allocator_start(ca))
-				goto err;
+		if (bch_cache_allocator_start(ca))
+			goto err;
 
 		mutex_lock(&c->bucket_lock);
-		for_each_cache(ca, c, i)
-			bch_prio_write(ca, true);
+		bch_prio_write(ca, true);
 		mutex_unlock(&c->bucket_lock);
 
 		err = "cannot allocate new UUID bucket";
@@ -2465,13 +2445,14 @@ static bool bch_is_open_backing(struct block_device *bdev)
 static bool bch_is_open_cache(struct block_device *bdev)
 {
 	struct cache_set *c, *tc;
-	struct cache *ca;
-	unsigned int i;
 
-	list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
-		for_each_cache(ca, c, i)
-			if (ca->bdev == bdev)
-				return true;
+	list_for_each_entry_safe(c, tc, &bch_cache_sets, list) {
+		struct cache *ca = c->cache;
+
+		if (ca->bdev == bdev)
+			return true;
+	}
+
 	return false;
 }
 
-- 
cgit v1.2.3


From 1132e56e78e3c8f4deaf68ed760c78456a9c61f4 Mon Sep 17 00:00:00 2001
From: Coly Li <colyli@suse.de>
Date: Thu, 1 Oct 2020 14:50:48 +0800
Subject: bcache: add set_uuid in struct cache_set

This patch adds a separated set_uuid[16] in struct cache_set, to store
the uuid of the cache set. This is the preparation to remove the
embedded struct cache_sb from struct cache_set.

Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/bcache.h    |  1 +
 drivers/md/bcache/debug.c     |  2 +-
 drivers/md/bcache/super.c     | 24 ++++++++++++------------
 include/trace/events/bcache.h |  4 ++--
 4 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 7ffe6b2d179b..94a62acac4fc 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -668,6 +668,7 @@ struct cache_set {
 	struct mutex		verify_lock;
 #endif
 
+	uint8_t			set_uuid[16];
 	unsigned int		nr_uuids;
 	struct uuid_entry	*uuids;
 	BKEY_PADDED(uuid_bucket);
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 336f43910383..0ccc1b0baa42 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -238,7 +238,7 @@ void bch_debug_init_cache_set(struct cache_set *c)
 	if (!IS_ERR_OR_NULL(bcache_debug)) {
 		char name[50];
 
-		snprintf(name, 50, "bcache-%pU", c->sb.set_uuid);
+		snprintf(name, 50, "bcache-%pU", c->set_uuid);
 		c->debug = debugfs_create_file(name, 0400, bcache_debug, c,
 					       &cache_set_debug_ops);
 	}
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 1eb55a600ed7..b84487bd8a38 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1189,8 +1189,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
 	struct cached_dev *exist_dc, *t;
 	int ret = 0;
 
-	if ((set_uuid && memcmp(set_uuid, c->sb.set_uuid, 16)) ||
-	    (!set_uuid && memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16)))
+	if ((set_uuid && memcmp(set_uuid, c->set_uuid, 16)) ||
+	    (!set_uuid && memcmp(dc->sb.set_uuid, c->set_uuid, 16)))
 		return -ENOENT;
 
 	if (dc->disk.c) {
@@ -1262,7 +1262,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
 		u->first_reg = u->last_reg = rtime;
 		bch_uuid_write(c);
 
-		memcpy(dc->sb.set_uuid, c->sb.set_uuid, 16);
+		memcpy(dc->sb.set_uuid, c->set_uuid, 16);
 		SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
 
 		bch_write_bdev_super(dc, &cl);
@@ -1324,7 +1324,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
 	pr_info("Caching %s as %s on set %pU\n",
 		dc->backing_dev_name,
 		dc->disk.disk->disk_name,
-		dc->disk.c->sb.set_uuid);
+		dc->disk.c->set_uuid);
 	return 0;
 }
 
@@ -1631,7 +1631,7 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
 	vaf.va = &args;
 
 	pr_err("error on %pU: %pV, disabling caching\n",
-	       c->sb.set_uuid, &vaf);
+	       c->set_uuid, &vaf);
 
 	va_end(args);
 
@@ -1684,7 +1684,7 @@ static void cache_set_free(struct closure *cl)
 	list_del(&c->list);
 	mutex_unlock(&bch_register_lock);
 
-	pr_info("Cache set %pU unregistered\n", c->sb.set_uuid);
+	pr_info("Cache set %pU unregistered\n", c->set_uuid);
 	wake_up(&unregister_wait);
 
 	closure_debug_destroy(&c->cl);
@@ -1754,7 +1754,7 @@ static void conditional_stop_bcache_device(struct cache_set *c,
 {
 	if (dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_ALWAYS) {
 		pr_warn("stop_when_cache_set_failed of %s is \"always\", stop it for failed cache set %pU.\n",
-			d->disk->disk_name, c->sb.set_uuid);
+			d->disk->disk_name, c->set_uuid);
 		bcache_device_stop(d);
 	} else if (atomic_read(&dc->has_dirty)) {
 		/*
@@ -1861,7 +1861,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 
 	bch_cache_accounting_init(&c->accounting, &c->cl);
 
-	memcpy(c->sb.set_uuid, sb->set_uuid, 16);
+	memcpy(c->set_uuid, sb->set_uuid, 16);
 	c->sb.block_size	= sb->block_size;
 	c->sb.bucket_size	= sb->bucket_size;
 	c->sb.nr_in_set		= sb->nr_in_set;
@@ -2144,7 +2144,7 @@ static const char *register_cache_set(struct cache *ca)
 	struct cache_set *c;
 
 	list_for_each_entry(c, &bch_cache_sets, list)
-		if (!memcmp(c->sb.set_uuid, ca->sb.set_uuid, 16)) {
+		if (!memcmp(c->set_uuid, ca->sb.set_uuid, 16)) {
 			if (c->cache)
 				return "duplicate cache set member";
 
@@ -2162,7 +2162,7 @@ static const char *register_cache_set(struct cache *ca)
 		return err;
 
 	err = "error creating kobject";
-	if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->sb.set_uuid) ||
+	if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->set_uuid) ||
 	    kobject_add(&c->internal, &c->kobj, "internal"))
 		goto err;
 
@@ -2187,7 +2187,7 @@ found:
 	 */
 	if (ca->sb.seq > c->sb.seq || c->sb.seq == 0) {
 		c->sb.version		= ca->sb.version;
-		memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16);
+		memcpy(c->set_uuid, ca->sb.set_uuid, 16);
 		c->sb.flags             = ca->sb.flags;
 		c->sb.seq		= ca->sb.seq;
 		pr_debug("set version = %llu\n", c->sb.version);
@@ -2702,7 +2702,7 @@ static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
 	list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) {
 		list_for_each_entry_safe(c, tc, &bch_cache_sets, list) {
 			char *pdev_set_uuid = pdev->dc->sb.set_uuid;
-			char *set_uuid = c->sb.uuid;
+			char *set_uuid = c->set_uuid;
 
 			if (!memcmp(pdev_set_uuid, set_uuid, 16)) {
 				list_del(&pdev->list);
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h
index 0bddea663b3b..e41c611d6d3b 100644
--- a/include/trace/events/bcache.h
+++ b/include/trace/events/bcache.h
@@ -164,7 +164,7 @@ TRACE_EVENT(bcache_write,
 	),
 
 	TP_fast_assign(
-		memcpy(__entry->uuid, c->sb.set_uuid, 16);
+		memcpy(__entry->uuid, c->set_uuid, 16);
 		__entry->inode		= inode;
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->nr_sector	= bio->bi_iter.bi_size >> 9;
@@ -200,7 +200,7 @@ DECLARE_EVENT_CLASS(cache_set,
 	),
 
 	TP_fast_assign(
-		memcpy(__entry->uuid, c->sb.set_uuid, 16);
+		memcpy(__entry->uuid, c->set_uuid, 16);
 	),
 
 	TP_printk("%pU", __entry->uuid)
-- 
cgit v1.2.3


From 4e1ebae3ee4e0ce384c33832f66e417a965b64bc Mon Sep 17 00:00:00 2001
From: Coly Li <colyli@suse.de>
Date: Thu, 1 Oct 2020 14:50:49 +0800
Subject: bcache: only use block_bytes() on struct cache

Because struct cache_set and struct cache both have struct cache_sb,
therefore macro block_bytes() can be used on both of them. When removing
the embedded struct cache_sb from struct cache_set, this macro won't be
used on struct cache_set anymore.

This patch unifies all block_bytes() usage only on struct cache, this is
one of the preparation to remove the embedded struct cache_sb from
struct cache_set.

Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/bcache.h  |  2 +-
 drivers/md/bcache/btree.c   | 24 ++++++++++++------------
 drivers/md/bcache/debug.c   |  8 ++++----
 drivers/md/bcache/journal.c |  8 ++++----
 drivers/md/bcache/request.c |  2 +-
 drivers/md/bcache/super.c   |  2 +-
 drivers/md/bcache/sysfs.c   |  2 +-
 7 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 94a62acac4fc..29bec61cafbb 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -759,7 +759,7 @@ struct bbio {
 
 #define bucket_pages(c)		((c)->sb.bucket_size / PAGE_SECTORS)
 #define bucket_bytes(c)		((c)->sb.bucket_size << 9)
-#define block_bytes(c)		((c)->sb.block_size << 9)
+#define block_bytes(ca)		((ca)->sb.block_size << 9)
 
 static inline unsigned int meta_bucket_pages(struct cache_sb *sb)
 {
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index f626d536981e..1f3d4870a8b2 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -104,7 +104,7 @@
 
 static inline struct bset *write_block(struct btree *b)
 {
-	return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c);
+	return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c->cache);
 }
 
 static void bch_btree_init_next(struct btree *b)
@@ -173,7 +173,7 @@ void bch_btree_node_read_done(struct btree *b)
 			goto err;
 
 		err = "bad btree header";
-		if (b->written + set_blocks(i, block_bytes(b->c)) >
+		if (b->written + set_blocks(i, block_bytes(b->c->cache)) >
 		    btree_blocks(b))
 			goto err;
 
@@ -199,13 +199,13 @@ void bch_btree_node_read_done(struct btree *b)
 
 		bch_btree_iter_push(iter, i->start, bset_bkey_last(i));
 
-		b->written += set_blocks(i, block_bytes(b->c));
+		b->written += set_blocks(i, block_bytes(b->c->cache));
 	}
 
 	err = "corrupted btree";
 	for (i = write_block(b);
 	     bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key);
-	     i = ((void *) i) + block_bytes(b->c))
+	     i = ((void *) i) + block_bytes(b->c->cache))
 		if (i->seq == b->keys.set[0].data->seq)
 			goto err;
 
@@ -347,7 +347,7 @@ static void do_btree_node_write(struct btree *b)
 
 	b->bio->bi_end_io	= btree_node_write_endio;
 	b->bio->bi_private	= cl;
-	b->bio->bi_iter.bi_size	= roundup(set_bytes(i), block_bytes(b->c));
+	b->bio->bi_iter.bi_size	= roundup(set_bytes(i), block_bytes(b->c->cache));
 	b->bio->bi_opf		= REQ_OP_WRITE | REQ_META | REQ_FUA;
 	bch_bio_map(b->bio, i);
 
@@ -423,10 +423,10 @@ void __bch_btree_node_write(struct btree *b, struct closure *parent)
 
 	do_btree_node_write(b);
 
-	atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size,
+	atomic_long_add(set_blocks(i, block_bytes(b->c->cache)) * b->c->sb.block_size,
 			&PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
 
-	b->written += set_blocks(i, block_bytes(b->c));
+	b->written += set_blocks(i, block_bytes(b->c->cache));
 }
 
 void bch_btree_node_write(struct btree *b, struct closure *parent)
@@ -1344,7 +1344,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
 
 	if (nodes < 2 ||
 	    __set_blocks(b->keys.set[0].data, keys,
-			 block_bytes(b->c)) > blocks * (nodes - 1))
+			 block_bytes(b->c->cache)) > blocks * (nodes - 1))
 		return 0;
 
 	for (i = 0; i < nodes; i++) {
@@ -1378,7 +1378,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
 			     k = bkey_next(k)) {
 				if (__set_blocks(n1, n1->keys + keys +
 						 bkey_u64s(k),
-						 block_bytes(b->c)) > blocks)
+						 block_bytes(b->c->cache)) > blocks)
 					break;
 
 				last = k;
@@ -1394,7 +1394,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
 			 * though)
 			 */
 			if (__set_blocks(n1, n1->keys + n2->keys,
-					 block_bytes(b->c)) >
+					 block_bytes(b->c->cache)) >
 			    btree_blocks(new_nodes[i]))
 				goto out_unlock_nocoalesce;
 
@@ -1403,7 +1403,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
 			last = &r->b->key;
 		}
 
-		BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) >
+		BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c->cache)) >
 		       btree_blocks(new_nodes[i]));
 
 		if (last)
@@ -2210,7 +2210,7 @@ static int btree_split(struct btree *b, struct btree_op *op,
 		goto err;
 
 	split = set_blocks(btree_bset_first(n1),
-			   block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5;
+			   block_bytes(n1->c->cache)) > (btree_blocks(b) * 4) / 5;
 
 	if (split) {
 		unsigned int keys = 0;
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 0ccc1b0baa42..b00fd08d696b 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -25,8 +25,8 @@ struct dentry *bcache_debug;
 	for (i = (start);						\
 	     (void *) i < (void *) (start) + (KEY_SIZE(&b->key) << 9) &&\
 	     i->seq == (start)->seq;					\
-	     i = (void *) i + set_blocks(i, block_bytes(b->c)) *	\
-		 block_bytes(b->c))
+	     i = (void *) i + set_blocks(i, block_bytes(b->c->cache)) *	\
+		 block_bytes(b->c->cache))
 
 void bch_btree_verify(struct btree *b)
 {
@@ -82,14 +82,14 @@ void bch_btree_verify(struct btree *b)
 
 		for_each_written_bset(b, ondisk, i) {
 			unsigned int block = ((void *) i - (void *) ondisk) /
-				block_bytes(b->c);
+				block_bytes(b->c->cache);
 
 			pr_err("*** on disk block %u:\n", block);
 			bch_dump_bset(&b->keys, i, block);
 		}
 
 		pr_err("*** block %zu not written\n",
-		       ((void *) i - (void *) ondisk) / block_bytes(b->c));
+		       ((void *) i - (void *) ondisk) / block_bytes(b->c->cache));
 
 		for (j = 0; j < inmemory->keys; j++)
 			if (inmemory->d[j] != sorted->d[j])
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index e89ae7c4ba97..ab0d06e4851d 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -98,7 +98,7 @@ reread:		left = ca->sb.bucket_size - offset;
 				return ret;
 			}
 
-			blocks = set_blocks(j, block_bytes(ca->set));
+			blocks = set_blocks(j, block_bytes(ca));
 
 			/*
 			 * Nodes in 'list' are in linear increasing order of
@@ -734,7 +734,7 @@ static void journal_write_unlocked(struct closure *cl)
 	struct cache *ca = c->cache;
 	struct journal_write *w = c->journal.cur;
 	struct bkey *k = &c->journal.key;
-	unsigned int i, sectors = set_blocks(w->data, block_bytes(c)) *
+	unsigned int i, sectors = set_blocks(w->data, block_bytes(ca)) *
 		c->sb.block_size;
 
 	struct bio *bio;
@@ -754,7 +754,7 @@ static void journal_write_unlocked(struct closure *cl)
 		return;
 	}
 
-	c->journal.blocks_free -= set_blocks(w->data, block_bytes(c));
+	c->journal.blocks_free -= set_blocks(w->data, block_bytes(ca));
 
 	w->data->btree_level = c->root->level;
 
@@ -847,7 +847,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
 		struct journal_write *w = c->journal.cur;
 
 		sectors = __set_blocks(w->data, w->data->keys + nkeys,
-				       block_bytes(c)) * c->sb.block_size;
+				       block_bytes(c->cache)) * c->sb.block_size;
 
 		if (sectors <= min_t(size_t,
 				     c->journal.blocks_free * c->sb.block_size,
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 7f54ae223644..d90319d15b59 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -99,7 +99,7 @@ static int bch_keylist_realloc(struct keylist *l, unsigned int u64s,
 	 * bch_data_insert_keys() will insert the keys created so far
 	 * and finish the rest when the keylist is empty.
 	 */
-	if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset))
+	if (newsize * sizeof(uint64_t) > block_bytes(c->cache) - sizeof(struct jset))
 		return -ENOMEM;
 
 	return __bch_keylist_realloc(l, u64s);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index b84487bd8a38..7f148489a2d8 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1527,7 +1527,7 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
 
 	kobject_init(&d->kobj, &bch_flash_dev_ktype);
 
-	if (bcache_device_init(d, block_bytes(c), u->sectors,
+	if (bcache_device_init(d, block_bytes(c->cache), u->sectors,
 			NULL, &bcache_flash_ops))
 		goto err;
 
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index ac06c0bc3c0a..b9f524ab5cc8 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -714,7 +714,7 @@ SHOW(__bch_cache_set)
 	sysfs_print(synchronous,		CACHE_SYNC(&c->sb));
 	sysfs_print(journal_delay_ms,		c->journal_delay_ms);
 	sysfs_hprint(bucket_size,		bucket_bytes(c));
-	sysfs_hprint(block_size,		block_bytes(c));
+	sysfs_hprint(block_size,		block_bytes(c->cache));
 	sysfs_print(tree_depth,			c->root->level);
 	sysfs_print(root_usage_percent,		bch_root_usage(c));
 
-- 
cgit v1.2.3


From 421cf1c573b18b0d770a4a47aefa3199eb5b94cf Mon Sep 17 00:00:00 2001
From: Coly Li <colyli@suse.de>
Date: Thu, 1 Oct 2020 14:50:50 +0800
Subject: bcache: remove useless alloc_bucket_pages()

Now no one uses alloc_bucket_pages() anymore, remove it from bcache.h.

Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/super.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 7f148489a2d8..3a79790f5fab 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1831,9 +1831,6 @@ void bch_cache_set_unregister(struct cache_set *c)
 	bch_cache_set_stop(c);
 }
 
-#define alloc_bucket_pages(gfp, c)			\
-	((void *) __get_free_pages(__GFP_ZERO|__GFP_COMP|gfp, ilog2(bucket_pages(c))))
-
 #define alloc_meta_bucket_pages(gfp, sb)		\
 	((void *) __get_free_pages(__GFP_ZERO|__GFP_COMP|gfp, ilog2(meta_bucket_pages(sb))))
 
-- 
cgit v1.2.3


From 3c4fae29826c5fc588aba946189b95dd9fb5e4e9 Mon Sep 17 00:00:00 2001
From: Coly Li <colyli@suse.de>
Date: Thu, 1 Oct 2020 14:50:51 +0800
Subject: bcache: remove useless bucket_pages()

It seems alloc_bucket_pages() is the only user of bucket_pages().
Considering alloc_bucket_pages() is removed from bcache code, it is safe
to remove the useless macro bucket_pages() now.

Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/bcache.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 29bec61cafbb..48a2585b6bbb 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -757,7 +757,6 @@ struct bbio {
 #define btree_default_blocks(c)						\
 	((unsigned int) ((PAGE_SECTORS * (c)->btree_pages) >> (c)->block_bits))
 
-#define bucket_pages(c)		((c)->sb.bucket_size / PAGE_SECTORS)
 #define bucket_bytes(c)		((c)->sb.bucket_size << 9)
 #define block_bytes(ca)		((ca)->sb.block_size << 9)
 
-- 
cgit v1.2.3


From 63a96c05cd43ccf66cb671af039c7931a840efe8 Mon Sep 17 00:00:00 2001
From: Coly Li <colyli@suse.de>
Date: Thu, 1 Oct 2020 14:50:52 +0800
Subject: bcache: only use bucket_bytes() on struct cache

Because struct cache_set and struct cache both have struct cache_sb,
macro bucket_bytes() currently are used on both of them. When removing
the embedded struct cache_sb from struct cache_set, this macro won't be
used on struct cache_set anymore.

This patch unifies all bucket_bytes() usage only on struct cache, this is
one of the preparation to remove the embedded struct cache_sb from
struct cache_set.

Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/bcache.h | 2 +-
 drivers/md/bcache/sysfs.c  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 48a2585b6bbb..94d4baf4c405 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -757,7 +757,7 @@ struct bbio {
 #define btree_default_blocks(c)						\
 	((unsigned int) ((PAGE_SECTORS * (c)->btree_pages) >> (c)->block_bits))
 
-#define bucket_bytes(c)		((c)->sb.bucket_size << 9)
+#define bucket_bytes(ca)	((ca)->sb.bucket_size << 9)
 #define block_bytes(ca)		((ca)->sb.block_size << 9)
 
 static inline unsigned int meta_bucket_pages(struct cache_sb *sb)
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index b9f524ab5cc8..4bfe98faadcc 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -713,7 +713,7 @@ SHOW(__bch_cache_set)
 
 	sysfs_print(synchronous,		CACHE_SYNC(&c->sb));
 	sysfs_print(journal_delay_ms,		c->journal_delay_ms);
-	sysfs_hprint(bucket_size,		bucket_bytes(c));
+	sysfs_hprint(bucket_size,		bucket_bytes(c->cache));
 	sysfs_hprint(block_size,		block_bytes(c->cache));
 	sysfs_print(tree_depth,			c->root->level);
 	sysfs_print(root_usage_percent,		bch_root_usage(c));
-- 
cgit v1.2.3


From 08a17828812adbe965b7af9a0e3e6a6024460cc8 Mon Sep 17 00:00:00 2001
From: Coly Li <colyli@suse.de>
Date: Thu, 1 Oct 2020 14:50:53 +0800
Subject: bcache: don't check seq numbers in register_cache_set()

In order to update the partial super block of cache set, the seq numbers
of cache and cache set are checked in register_cache_set(). If cache's
seq number is larger than cache set's seq number, cache set must update
its partial super block from cache's super block. It is unncessary when
the embedded struct cache_sb is removed from struct cache set.

This patch removed the seq numbers checking from register_cache_set(),
because later there will be no such partial super block in struct cache
set, the cache set will directly reference in-memory super block from
struct cache. This is a preparation patch for removing embedded struct
cache_sb from struct cache_set.

Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/super.c | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 3a79790f5fab..291a3d4a4591 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -2175,21 +2175,6 @@ found:
 	    sysfs_create_link(&c->kobj, &ca->kobj, buf))
 		goto err;
 
-	/*
-	 * A special case is both ca->sb.seq and c->sb.seq are 0,
-	 * such condition happens on a new created cache device whose
-	 * super block is never flushed yet. In this case c->sb.version
-	 * and other members should be updated too, otherwise we will
-	 * have a mistaken super block version in cache set.
-	 */
-	if (ca->sb.seq > c->sb.seq || c->sb.seq == 0) {
-		c->sb.version		= ca->sb.version;
-		memcpy(c->set_uuid, ca->sb.set_uuid, 16);
-		c->sb.flags             = ca->sb.flags;
-		c->sb.seq		= ca->sb.seq;
-		pr_debug("set version = %llu\n", c->sb.version);
-	}
-
 	kobject_get(&ca->kobj);
 	ca->set = c;
 	ca->set->cache = ca;
-- 
cgit v1.2.3


From ebaa1ac12b0c97337a5bad1d57334056791aaa88 Mon Sep 17 00:00:00 2001
From: Coly Li <colyli@suse.de>
Date: Thu, 1 Oct 2020 14:50:54 +0800
Subject: bcache: remove can_attach_cache()

After removing the embedded struct cache_sb from struct cache_set, cache
set will directly reference the in-memory super block of struct cache.
It is unnecessary to compare block_size, bucket_size and nr_in_set from
the identical in-memory super block in can_attach_cache().

This is a preparation patch for latter removing cache_set->sb from
struct cache_set.

Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/super.c | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 291a3d4a4591..1450c06a0e6e 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -2127,13 +2127,6 @@ err:
 	return -EIO;
 }
 
-static bool can_attach_cache(struct cache *ca, struct cache_set *c)
-{
-	return ca->sb.block_size	== c->sb.block_size &&
-		ca->sb.bucket_size	== c->sb.bucket_size &&
-		ca->sb.nr_in_set	== c->sb.nr_in_set;
-}
-
 static const char *register_cache_set(struct cache *ca)
 {
 	char buf[12];
@@ -2145,9 +2138,6 @@ static const char *register_cache_set(struct cache *ca)
 			if (c->cache)
 				return "duplicate cache set member";
 
-			if (!can_attach_cache(ca, c))
-				return "cache sb does not match set";
-
 			if (!CACHE_SYNC(&ca->sb))
 				SET_CACHE_SYNC(&c->sb, false);
 
-- 
cgit v1.2.3


From 6f9414e0f6f35c7669dad5ac1a838ce323302f03 Mon Sep 17 00:00:00 2001
From: Coly Li <colyli@suse.de>
Date: Thu, 1 Oct 2020 14:50:55 +0800
Subject: bcache: check and set sync status on cache's in-memory super block

Currently the cache's sync status is checked and set on cache set's in-
memory partial super block. After removing the embedded struct cache_sb
from cache set and reference cache's in-memory super block from struct
cache_set, the sync status can set and check directly on cache's super
block.

This patch checks and sets the cache sync status directly on cache's
in-memory super block. This is a preparation for later removing embedded
struct cache_sb from struct cache_set.

Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/alloc.c   | 2 +-
 drivers/md/bcache/journal.c | 2 +-
 drivers/md/bcache/super.c   | 7 ++-----
 drivers/md/bcache/sysfs.c   | 6 +++---
 4 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 1b8310992dd0..65fdbdeb5134 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -361,7 +361,7 @@ retry_invalidate:
 		 * new stuff to them:
 		 */
 		allocator_wait(ca, !atomic_read(&ca->set->prio_blocked));
-		if (CACHE_SYNC(&ca->set->sb)) {
+		if (CACHE_SYNC(&ca->sb)) {
 			/*
 			 * This could deadlock if an allocation with a btree
 			 * node locked ever blocked - having the btree node
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index ab0d06e4851d..cd46f33db507 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -915,7 +915,7 @@ atomic_t *bch_journal(struct cache_set *c,
 	if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags)))
 		return NULL;
 
-	if (!CACHE_SYNC(&c->sb))
+	if (!CACHE_SYNC(&c->cache->sb))
 		return NULL;
 
 	w = journal_wait_for_write(c, bch_keylist_nkeys(keys));
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 1450c06a0e6e..96323406fa6f 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1969,7 +1969,7 @@ static int run_cache_set(struct cache_set *c)
 	c->nbuckets = ca->sb.nbuckets;
 	set_gc_sectors(c);
 
-	if (CACHE_SYNC(&c->sb)) {
+	if (CACHE_SYNC(&c->cache->sb)) {
 		struct bkey *k;
 		struct jset *j;
 
@@ -2092,7 +2092,7 @@ static int run_cache_set(struct cache_set *c)
 		 * everything is set up - fortunately journal entries won't be
 		 * written until the SET_CACHE_SYNC() here:
 		 */
-		SET_CACHE_SYNC(&c->sb, true);
+		SET_CACHE_SYNC(&c->cache->sb, true);
 
 		bch_journal_next(&c->journal);
 		bch_journal_meta(c, &cl);
@@ -2138,9 +2138,6 @@ static const char *register_cache_set(struct cache *ca)
 			if (c->cache)
 				return "duplicate cache set member";
 
-			if (!CACHE_SYNC(&ca->sb))
-				SET_CACHE_SYNC(&c->sb, false);
-
 			goto found;
 		}
 
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 4bfe98faadcc..554e3afc9b68 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -711,7 +711,7 @@ SHOW(__bch_cache_set)
 {
 	struct cache_set *c = container_of(kobj, struct cache_set, kobj);
 
-	sysfs_print(synchronous,		CACHE_SYNC(&c->sb));
+	sysfs_print(synchronous,		CACHE_SYNC(&c->cache->sb));
 	sysfs_print(journal_delay_ms,		c->journal_delay_ms);
 	sysfs_hprint(bucket_size,		bucket_bytes(c->cache));
 	sysfs_hprint(block_size,		block_bytes(c->cache));
@@ -812,8 +812,8 @@ STORE(__bch_cache_set)
 	if (attr == &sysfs_synchronous) {
 		bool sync = strtoul_or_return(buf);
 
-		if (sync != CACHE_SYNC(&c->sb)) {
-			SET_CACHE_SYNC(&c->sb, sync);
+		if (sync != CACHE_SYNC(&c->cache->sb)) {
+			SET_CACHE_SYNC(&c->cache->sb, sync);
 			bcache_write_super(c);
 		}
 	}
-- 
cgit v1.2.3


From 4a784266c6a75f375e08915b35e909df19eff17f Mon Sep 17 00:00:00 2001
From: Coly Li <colyli@suse.de>
Date: Thu, 1 Oct 2020 14:50:56 +0800
Subject: bcache: remove embedded struct cache_sb from struct cache_set

Since bcache code was merged into mainline kerrnel, each cache set only
as one single cache in it. The multiple caches framework is here but the
code is far from completed. Considering the multiple copies of cached
data can also be stored on e.g. md raid1 devices, it is unnecessary to
support multiple caches in one cache set indeed.

The previous preparation patches fix the dependencies of explicitly
making a cache set only have single cache. Now we don't have to maintain
an embedded partial super block in struct cache_set, the in-memory super
block can be directly referenced from struct cache.

This patch removes the embedded struct cache_sb from struct cache_set,
and fixes all locations where the superb lock was referenced from this
removed super block by referencing the in-memory super block of struct
cache.

Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/alloc.c     |  6 +++---
 drivers/md/bcache/bcache.h    |  4 +---
 drivers/md/bcache/btree.c     | 17 ++++++++--------
 drivers/md/bcache/btree.h     |  2 +-
 drivers/md/bcache/extents.c   |  6 +++---
 drivers/md/bcache/features.c  |  4 ++--
 drivers/md/bcache/io.c        |  2 +-
 drivers/md/bcache/journal.c   | 11 +++++-----
 drivers/md/bcache/request.c   |  4 ++--
 drivers/md/bcache/super.c     | 47 ++++++++++++++++---------------------------
 drivers/md/bcache/writeback.c |  2 +-
 11 files changed, 46 insertions(+), 59 deletions(-)

diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 65fdbdeb5134..8c371d5eef8e 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -87,7 +87,7 @@ void bch_rescale_priorities(struct cache_set *c, int sectors)
 {
 	struct cache *ca;
 	struct bucket *b;
-	unsigned long next = c->nbuckets * c->sb.bucket_size / 1024;
+	unsigned long next = c->nbuckets * c->cache->sb.bucket_size / 1024;
 	int r;
 
 	atomic_sub(sectors, &c->rescale);
@@ -583,7 +583,7 @@ static struct open_bucket *pick_data_bucket(struct cache_set *c,
 					   struct open_bucket, list);
 found:
 	if (!ret->sectors_free && KEY_PTRS(alloc)) {
-		ret->sectors_free = c->sb.bucket_size;
+		ret->sectors_free = c->cache->sb.bucket_size;
 		bkey_copy(&ret->key, alloc);
 		bkey_init(alloc);
 	}
@@ -677,7 +677,7 @@ bool bch_alloc_sectors(struct cache_set *c,
 				&PTR_CACHE(c, &b->key, i)->sectors_written);
 	}
 
-	if (b->sectors_free < c->sb.block_size)
+	if (b->sectors_free < c->cache->sb.block_size)
 		b->sectors_free = 0;
 
 	/*
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 94d4baf4c405..1d57f48307e6 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -517,8 +517,6 @@ struct cache_set {
 	atomic_t		idle_counter;
 	atomic_t		at_max_writeback_rate;
 
-	struct cache_sb		sb;
-
 	struct cache		*cache;
 
 	struct bcache_device	**devices;
@@ -799,7 +797,7 @@ static inline sector_t bucket_to_sector(struct cache_set *c, size_t b)
 
 static inline sector_t bucket_remainder(struct cache_set *c, sector_t s)
 {
-	return s & (c->sb.bucket_size - 1);
+	return s & (c->cache->sb.bucket_size - 1);
 }
 
 static inline struct cache *PTR_CACHE(struct cache_set *c,
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 1f3d4870a8b2..910df242c83d 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -117,7 +117,7 @@ static void bch_btree_init_next(struct btree *b)
 
 	if (b->written < btree_blocks(b))
 		bch_bset_init_next(&b->keys, write_block(b),
-				   bset_magic(&b->c->sb));
+				   bset_magic(&b->c->cache->sb));
 
 }
 
@@ -155,7 +155,7 @@ void bch_btree_node_read_done(struct btree *b)
 	 * See the comment arount cache_set->fill_iter.
 	 */
 	iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO);
-	iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
+	iter->size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size;
 	iter->used = 0;
 
 #ifdef CONFIG_BCACHE_DEBUG
@@ -178,7 +178,7 @@ void bch_btree_node_read_done(struct btree *b)
 			goto err;
 
 		err = "bad magic";
-		if (i->magic != bset_magic(&b->c->sb))
+		if (i->magic != bset_magic(&b->c->cache->sb))
 			goto err;
 
 		err = "bad checksum";
@@ -219,7 +219,7 @@ void bch_btree_node_read_done(struct btree *b)
 
 	if (b->written < btree_blocks(b))
 		bch_bset_init_next(&b->keys, write_block(b),
-				   bset_magic(&b->c->sb));
+				   bset_magic(&b->c->cache->sb));
 out:
 	mempool_free(iter, &b->c->fill_iter);
 	return;
@@ -423,7 +423,7 @@ void __bch_btree_node_write(struct btree *b, struct closure *parent)
 
 	do_btree_node_write(b);
 
-	atomic_long_add(set_blocks(i, block_bytes(b->c->cache)) * b->c->sb.block_size,
+	atomic_long_add(set_blocks(i, block_bytes(b->c->cache)) * b->c->cache->sb.block_size,
 			&PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
 
 	b->written += set_blocks(i, block_bytes(b->c->cache));
@@ -738,7 +738,7 @@ void bch_btree_cache_free(struct cache_set *c)
 	if (c->verify_data)
 		list_move(&c->verify_data->list, &c->btree_cache);
 
-	free_pages((unsigned long) c->verify_ondisk, ilog2(meta_bucket_pages(&c->sb)));
+	free_pages((unsigned long) c->verify_ondisk, ilog2(meta_bucket_pages(&c->cache->sb)));
 #endif
 
 	list_splice(&c->btree_cache_freeable,
@@ -785,7 +785,8 @@ int bch_btree_cache_alloc(struct cache_set *c)
 	mutex_init(&c->verify_lock);
 
 	c->verify_ondisk = (void *)
-		__get_free_pages(GFP_KERNEL|__GFP_COMP, ilog2(meta_bucket_pages(&c->sb)));
+		__get_free_pages(GFP_KERNEL|__GFP_COMP,
+				 ilog2(meta_bucket_pages(&c->cache->sb)));
 	if (!c->verify_ondisk) {
 		/*
 		 * Don't worry about the mca_rereserve buckets
@@ -1108,7 +1109,7 @@ retry:
 	}
 
 	b->parent = parent;
-	bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb));
+	bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->cache->sb));
 
 	mutex_unlock(&c->bucket_lock);
 
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 257969980c49..50482107134f 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -194,7 +194,7 @@ static inline unsigned int bset_block_offset(struct btree *b, struct bset *i)
 
 static inline void set_gc_sectors(struct cache_set *c)
 {
-	atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16);
+	atomic_set(&c->sectors_to_gc, c->cache->sb.bucket_size * c->nbuckets / 16);
 }
 
 void bkey_put(struct cache_set *c, struct bkey *k);
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
index 9162af5bb6ec..f4658a1f37b8 100644
--- a/drivers/md/bcache/extents.c
+++ b/drivers/md/bcache/extents.c
@@ -54,7 +54,7 @@ static bool __ptr_invalid(struct cache_set *c, const struct bkey *k)
 			size_t bucket = PTR_BUCKET_NR(c, k, i);
 			size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
 
-			if (KEY_SIZE(k) + r > c->sb.bucket_size ||
+			if (KEY_SIZE(k) + r > c->cache->sb.bucket_size ||
 			    bucket <  ca->sb.first_bucket ||
 			    bucket >= ca->sb.nbuckets)
 				return true;
@@ -75,7 +75,7 @@ static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
 			size_t bucket = PTR_BUCKET_NR(c, k, i);
 			size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
 
-			if (KEY_SIZE(k) + r > c->sb.bucket_size)
+			if (KEY_SIZE(k) + r > c->cache->sb.bucket_size)
 				return "bad, length too big";
 			if (bucket <  ca->sb.first_bucket)
 				return "bad, short offset";
@@ -136,7 +136,7 @@ static void bch_bkey_dump(struct btree_keys *keys, const struct bkey *k)
 		size_t n = PTR_BUCKET_NR(b->c, k, j);
 
 		pr_cont(" bucket %zu", n);
-		if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets)
+		if (n >= b->c->cache->sb.first_bucket && n < b->c->cache->sb.nbuckets)
 			pr_cont(" prio %i",
 				PTR_BUCKET(b->c, k, j)->prio);
 	}
diff --git a/drivers/md/bcache/features.c b/drivers/md/bcache/features.c
index 4442df48d28c..6469223f0b77 100644
--- a/drivers/md/bcache/features.c
+++ b/drivers/md/bcache/features.c
@@ -30,7 +30,7 @@ static struct feature feature_list[] = {
 	for (f = &feature_list[0]; f->compat != 0; f++) {		\
 		if (f->compat != BCH_FEATURE_ ## type)			\
 			continue;					\
-		if (BCH_HAS_ ## type ## _FEATURE(&c->sb, f->mask)) {	\
+		if (BCH_HAS_ ## type ## _FEATURE(&c->cache->sb, f->mask)) {	\
 			if (first) {					\
 				out += snprintf(out, buf + size - out,	\
 						"[");	\
@@ -44,7 +44,7 @@ static struct feature feature_list[] = {
 									\
 		out += snprintf(out, buf + size - out, "%s", f->string);\
 									\
-		if (BCH_HAS_ ## type ## _FEATURE(&c->sb, f->mask))	\
+		if (BCH_HAS_ ## type ## _FEATURE(&c->cache->sb, f->mask))	\
 			out += snprintf(out, buf + size - out, "]");	\
 									\
 		first = false;						\
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index a14a445618b4..dad71a6b7889 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -26,7 +26,7 @@ struct bio *bch_bbio_alloc(struct cache_set *c)
 	struct bbio *b = mempool_alloc(&c->bio_meta, GFP_NOIO);
 	struct bio *bio = &b->bio;
 
-	bio_init(bio, bio->bi_inline_vecs, meta_bucket_pages(&c->sb));
+	bio_init(bio, bio->bi_inline_vecs, meta_bucket_pages(&c->cache->sb));
 
 	return bio;
 }
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index cd46f33db507..aefbdb7e003b 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -666,7 +666,7 @@ static void journal_reclaim(struct cache_set *c)
 
 	bkey_init(k);
 	SET_KEY_PTRS(k, 1);
-	c->journal.blocks_free = c->sb.bucket_size >> c->block_bits;
+	c->journal.blocks_free = ca->sb.bucket_size >> c->block_bits;
 
 out:
 	if (!journal_full(&c->journal))
@@ -735,7 +735,7 @@ static void journal_write_unlocked(struct closure *cl)
 	struct journal_write *w = c->journal.cur;
 	struct bkey *k = &c->journal.key;
 	unsigned int i, sectors = set_blocks(w->data, block_bytes(ca)) *
-		c->sb.block_size;
+		ca->sb.block_size;
 
 	struct bio *bio;
 	struct bio_list list;
@@ -762,7 +762,7 @@ static void journal_write_unlocked(struct closure *cl)
 	bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket);
 
 	w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0];
-	w->data->magic		= jset_magic(&c->sb);
+	w->data->magic		= jset_magic(&ca->sb);
 	w->data->version	= BCACHE_JSET_VERSION;
 	w->data->last_seq	= last_seq(&c->journal);
 	w->data->csum		= csum_set(w->data);
@@ -838,6 +838,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
 	size_t sectors;
 	struct closure cl;
 	bool wait = false;
+	struct cache *ca = c->cache;
 
 	closure_init_stack(&cl);
 
@@ -847,10 +848,10 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
 		struct journal_write *w = c->journal.cur;
 
 		sectors = __set_blocks(w->data, w->data->keys + nkeys,
-				       block_bytes(c->cache)) * c->sb.block_size;
+				       block_bytes(ca)) * ca->sb.block_size;
 
 		if (sectors <= min_t(size_t,
-				     c->journal.blocks_free * c->sb.block_size,
+				     c->journal.blocks_free * ca->sb.block_size,
 				     PAGE_SECTORS << JSET_BITS))
 			return w;
 
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index d90319d15b59..214326383145 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -394,8 +394,8 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
 			goto skip;
 	}
 
-	if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
-	    bio_sectors(bio) & (c->sb.block_size - 1)) {
+	if (bio->bi_iter.bi_sector & (c->cache->sb.block_size - 1) ||
+	    bio_sectors(bio) & (c->cache->sb.block_size - 1)) {
 		pr_debug("skipping unaligned io\n");
 		goto skip;
 	}
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 96323406fa6f..46a00134a36a 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -350,16 +350,10 @@ void bcache_write_super(struct cache_set *c)
 	down(&c->sb_write_mutex);
 	closure_init(cl, &c->cl);
 
-	c->sb.seq++;
+	ca->sb.seq++;
 
-	if (c->sb.version > version)
-		version = c->sb.version;
-
-	ca->sb.version		= version;
-	ca->sb.seq		= c->sb.seq;
-	ca->sb.last_mount	= c->sb.last_mount;
-
-	SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb));
+	if (ca->sb.version < version)
+		ca->sb.version = version;
 
 	bio_init(bio, ca->sb_bv, 1);
 	bio_set_dev(bio, ca->bdev);
@@ -477,7 +471,7 @@ static int __uuid_write(struct cache_set *c)
 {
 	BKEY_PADDED(key) k;
 	struct closure cl;
-	struct cache *ca;
+	struct cache *ca = c->cache;
 	unsigned int size;
 
 	closure_init_stack(&cl);
@@ -486,13 +480,12 @@ static int __uuid_write(struct cache_set *c)
 	if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, true))
 		return 1;
 
-	size =  meta_bucket_pages(&c->sb) * PAGE_SECTORS;
+	size =  meta_bucket_pages(&ca->sb) * PAGE_SECTORS;
 	SET_KEY_SIZE(&k.key, size);
 	uuid_io(c, REQ_OP_WRITE, 0, &k.key, &cl);
 	closure_sync(&cl);
 
 	/* Only one bucket used for uuid write */
-	ca = PTR_CACHE(c, &k.key, 0);
 	atomic_long_add(ca->sb.bucket_size, &ca->meta_sectors_written);
 
 	bkey_copy(&c->uuid_bucket, &k.key);
@@ -1205,7 +1198,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
 		return -EINVAL;
 	}
 
-	if (dc->sb.block_size < c->sb.block_size) {
+	if (dc->sb.block_size < c->cache->sb.block_size) {
 		/* Will die */
 		pr_err("Couldn't attach %s: block size less than set's block size\n",
 		       dc->backing_dev_name);
@@ -1663,6 +1656,9 @@ static void cache_set_free(struct closure *cl)
 	bch_journal_free(c);
 
 	mutex_lock(&bch_register_lock);
+	bch_bset_sort_state_free(&c->sort);
+	free_pages((unsigned long) c->uuids, ilog2(meta_bucket_pages(&c->cache->sb)));
+
 	ca = c->cache;
 	if (ca) {
 		ca->set = NULL;
@@ -1670,8 +1666,6 @@ static void cache_set_free(struct closure *cl)
 		kobject_put(&ca->kobj);
 	}
 
-	bch_bset_sort_state_free(&c->sort);
-	free_pages((unsigned long) c->uuids, ilog2(meta_bucket_pages(&c->sb)));
 
 	if (c->moving_gc_wq)
 		destroy_workqueue(c->moving_gc_wq);
@@ -1837,6 +1831,7 @@ void bch_cache_set_unregister(struct cache_set *c)
 struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 {
 	int iter_size;
+	struct cache *ca = container_of(sb, struct cache, sb);
 	struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
 
 	if (!c)
@@ -1859,23 +1854,15 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 	bch_cache_accounting_init(&c->accounting, &c->cl);
 
 	memcpy(c->set_uuid, sb->set_uuid, 16);
-	c->sb.block_size	= sb->block_size;
-	c->sb.bucket_size	= sb->bucket_size;
-	c->sb.nr_in_set		= sb->nr_in_set;
-	c->sb.last_mount	= sb->last_mount;
-	c->sb.version		= sb->version;
-	if (c->sb.version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) {
-		c->sb.feature_compat = sb->feature_compat;
-		c->sb.feature_ro_compat = sb->feature_ro_compat;
-		c->sb.feature_incompat = sb->feature_incompat;
-	}
 
+	c->cache		= ca;
+	c->cache->set		= c;
 	c->bucket_bits		= ilog2(sb->bucket_size);
 	c->block_bits		= ilog2(sb->block_size);
-	c->nr_uuids		= meta_bucket_bytes(&c->sb) / sizeof(struct uuid_entry);
+	c->nr_uuids		= meta_bucket_bytes(sb) / sizeof(struct uuid_entry);
 	c->devices_max_used	= 0;
 	atomic_set(&c->attached_dev_nr, 0);
-	c->btree_pages		= meta_bucket_pages(&c->sb);
+	c->btree_pages		= meta_bucket_pages(sb);
 	if (c->btree_pages > BTREE_MAX_PAGES)
 		c->btree_pages = max_t(int, c->btree_pages / 4,
 				       BTREE_MAX_PAGES);
@@ -1913,7 +1900,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 
 	if (mempool_init_kmalloc_pool(&c->bio_meta, 2,
 			sizeof(struct bbio) +
-			sizeof(struct bio_vec) * meta_bucket_pages(&c->sb)))
+			sizeof(struct bio_vec) * meta_bucket_pages(sb)))
 		goto err;
 
 	if (mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size))
@@ -1923,7 +1910,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 			BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
 		goto err;
 
-	c->uuids = alloc_meta_bucket_pages(GFP_KERNEL, &c->sb);
+	c->uuids = alloc_meta_bucket_pages(GFP_KERNEL, sb);
 	if (!c->uuids)
 		goto err;
 
@@ -2103,7 +2090,7 @@ static int run_cache_set(struct cache_set *c)
 		goto err;
 
 	closure_sync(&cl);
-	c->sb.last_mount = (u32)ktime_get_real_seconds();
+	c->cache->sb.last_mount = (u32)ktime_get_real_seconds();
 	bcache_write_super(c);
 
 	list_for_each_entry_safe(dc, t, &uncached_devices, list)
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 4f4ad6b3d43a..3c74996978da 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -35,7 +35,7 @@ static uint64_t __calc_target_rate(struct cached_dev *dc)
 	 * This is the size of the cache, minus the amount used for
 	 * flash-only devices
 	 */
-	uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
+	uint64_t cache_sectors = c->nbuckets * c->cache->sb.bucket_size -
 				atomic_long_read(&c->flash_dev_dirty_sectors);
 
 	/*
-- 
cgit v1.2.3


From 99ba84c5de1e2f5b2b78e9e3bc89e50ec9bc0f29 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Fri, 2 Oct 2020 18:59:28 -0500
Subject: rsxx: Use fallthrough pseudo-keyword

Replace /* Fall through. */ comment with the new pseudo-keyword macro
fallthrough[1].

[1] https://www.kernel.org/doc/html/v5.7/process/deprecated.html?highlight=fallthrough#implicit-switch-case-fall-through

Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/rsxx/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c
index 8799e3bab067..63f549889f87 100644
--- a/drivers/block/rsxx/core.c
+++ b/drivers/block/rsxx/core.c
@@ -439,7 +439,7 @@ static void card_state_change(struct rsxx_cardinfo *card,
 	case CARD_STATE_FAULT:
 		dev_crit(CARD_TO_DEV(card),
 			"Hardware Fault reported!\n");
-		/* Fall through. */
+		fallthrough;
 
 	/* Everything else, detach DMA interface if it's attached. */
 	case CARD_STATE_SHUTDOWN:
-- 
cgit v1.2.3


From f5ace5ef37b1e1de49882248656f35c45e041585 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Fri, 2 Oct 2020 18:10:33 -0500
Subject: block: scsi_ioctl: Avoid the use of one-element arrays

One-element arrays are being deprecated[1]. Replace the one-element array
with a simple object of type compat_caddr_t: 'compat_caddr_t unused'[2],
once it seems this field is actually never used.

Also, update struct cdrom_generic_command in UAPI by adding an
anonimous union to avoid using the one-element array _reserved_.

[1] https://www.kernel.org/doc/html/v5.9-rc1/process/deprecated.html#zero-length-and-one-element-arrays
[2] https://github.com/KSPP/linux/issues/86

Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Link: https://lore.kernel.org/lkml/5f76f5d0.qJ4t%2FHWuRzSW7bTa%25lkp@intel.com/
Build-tested-by: kernel test robot <lkp@intel.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/scsi_ioctl.c         | 6 +++---
 include/uapi/linux/cdrom.h | 5 ++++-
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 4421e61c1af1..227f489aeaa5 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -651,7 +651,7 @@ struct compat_cdrom_generic_command {
 	unsigned char	data_direction;
 	compat_int_t	quiet;
 	compat_int_t	timeout;
-	compat_caddr_t	reserved[1];
+	compat_caddr_t	unused;
 };
 #endif
 
@@ -673,7 +673,7 @@ static int scsi_get_cdrom_generic_arg(struct cdrom_generic_command *cgc,
 			.data_direction	= cgc32.data_direction,
 			.quiet		= cgc32.quiet,
 			.timeout	= cgc32.timeout,
-			.reserved[0]	= compat_ptr(cgc32.reserved[0]),
+			.unused		= compat_ptr(cgc32.unused),
 		};
 		memcpy(&cgc->cmd, &cgc32.cmd, CDROM_PACKET_SIZE);
 		return 0;
@@ -698,7 +698,7 @@ static int scsi_put_cdrom_generic_arg(const struct cdrom_generic_command *cgc,
 			.data_direction	= cgc->data_direction,
 			.quiet		= cgc->quiet,
 			.timeout	= cgc->timeout,
-			.reserved[0]	= (uintptr_t)(cgc->reserved[0]),
+			.unused		= (uintptr_t)(cgc->unused),
 		};
 		memcpy(&cgc32.cmd, &cgc->cmd, CDROM_PACKET_SIZE);
 
diff --git a/include/uapi/linux/cdrom.h b/include/uapi/linux/cdrom.h
index 2817230148fd..6c34f6e2f1f7 100644
--- a/include/uapi/linux/cdrom.h
+++ b/include/uapi/linux/cdrom.h
@@ -289,7 +289,10 @@ struct cdrom_generic_command
 	unsigned char		data_direction;
 	int			quiet;
 	int			timeout;
-	void			__user *reserved[1];	/* unused, actually */
+	union {
+		void		__user *reserved[1];	/* unused, actually */
+		void            __user *unused;
+	};
 };
 
 /*
-- 
cgit v1.2.3


From bdb01301f3ea51a59eff252b06643fc1fe843e57 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.com>
Date: Wed, 19 Aug 2020 23:20:30 +0800
Subject: scsi: Add host and host template flag 'host_tagset'

Add Host and host template flag 'host_tagset' so hostwide tagset can be
shared on multiple reply queues after the SCSI device's reply queue is
converted to blk-mq hw queue.

[jpg: Update comment on .can_queue and add Scsi_Host.host_tagset]

Signed-off-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: John Garry <john.garry@huawei.com>
Tested-by: Don Brace<don.brace@microsemi.com> #SCSI resv cmds patches used
Tested-by: Douglas Gilbert <dgilbert@interlog.com>
Acked-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/scsi/hosts.c     | 1 +
 drivers/scsi/scsi_lib.c  | 2 ++
 include/scsi/scsi_host.h | 9 ++++++++-
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index 37d1c5565d90..2f162603876f 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -421,6 +421,7 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
 	shost->cmd_per_lun = sht->cmd_per_lun;
 	shost->unchecked_isa_dma = sht->unchecked_isa_dma;
 	shost->no_write_same = sht->no_write_same;
+	shost->host_tagset = sht->host_tagset;
 
 	if (shost_eh_deadline == -1 || !sht->eh_host_reset_handler)
 		shost->eh_deadline = -1;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 7affaaf8b98e..a89478a0c588 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1891,6 +1891,8 @@ int scsi_mq_setup_tags(struct Scsi_Host *shost)
 	tag_set->flags |=
 		BLK_ALLOC_POLICY_TO_MQ_FLAG(shost->hostt->tag_alloc_policy);
 	tag_set->driver_data = shost;
+	if (shost->host_tagset)
+		tag_set->flags |= BLK_MQ_F_TAG_HCTX_SHARED;
 
 	return blk_mq_alloc_tag_set(tag_set);
 }
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index 46ef8cccc982..701f178b20ae 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -436,6 +436,9 @@ struct scsi_host_template {
 	/* True if the controller does not support WRITE SAME */
 	unsigned no_write_same:1;
 
+	/* True if the host uses host-wide tagspace */
+	unsigned host_tagset:1;
+
 	/*
 	 * Countdown for host blocking with no commands outstanding.
 	 */
@@ -603,7 +606,8 @@ struct Scsi_Host {
 	 *
 	 * Note: it is assumed that each hardware queue has a queue depth of
 	 * can_queue. In other words, the total queue depth per host
-	 * is nr_hw_queues * can_queue.
+	 * is nr_hw_queues * can_queue. However, for when host_tagset is set,
+	 * the total queue depth is can_queue.
 	 */
 	unsigned nr_hw_queues;
 	unsigned active_mode:2;
@@ -634,6 +638,9 @@ struct Scsi_Host {
 	/* The controller does not support WRITE SAME */
 	unsigned no_write_same:1;
 
+	/* True if the host uses host-wide tagspace */
+	unsigned host_tagset:1;
+
 	/* Host responded with short (<36 bytes) INQUIRY result */
 	unsigned short_inquiry:1;
 
-- 
cgit v1.2.3


From 64f1501bd2a9c8ab1df548e1ed224552f9c7ac71 Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Wed, 19 Aug 2020 23:20:31 +0800
Subject: scsi: core: Show nr_hw_queues in sysfs

So that we don't use a value of 0 for when Scsi_Host.nr_hw_queues is unset,
use the tag_set->nr_hw_queues (which holds 1 for this case).

Signed-off-by: John Garry <john.garry@huawei.com>
Tested-by: Douglas Gilbert <dgilbert@interlog.com>
Acked-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/scsi/scsi_sysfs.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index 163dbcb741c1..d6e344fa33ad 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -393,6 +393,16 @@ show_use_blk_mq(struct device *dev, struct device_attribute *attr, char *buf)
 }
 static DEVICE_ATTR(use_blk_mq, S_IRUGO, show_use_blk_mq, NULL);
 
+static ssize_t
+show_nr_hw_queues(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct Scsi_Host *shost = class_to_shost(dev);
+	struct blk_mq_tag_set *tag_set = &shost->tag_set;
+
+	return snprintf(buf, 20, "%d\n", tag_set->nr_hw_queues);
+}
+static DEVICE_ATTR(nr_hw_queues, S_IRUGO, show_nr_hw_queues, NULL);
+
 static struct attribute *scsi_sysfs_shost_attrs[] = {
 	&dev_attr_use_blk_mq.attr,
 	&dev_attr_unique_id.attr,
@@ -411,6 +421,7 @@ static struct attribute *scsi_sysfs_shost_attrs[] = {
 	&dev_attr_prot_guard_type.attr,
 	&dev_attr_host_reset.attr,
 	&dev_attr_eh_deadline.attr,
+	&dev_attr_nr_hw_queues.attr,
 	NULL
 };
 
-- 
cgit v1.2.3


From 8d98416a55ebfecbe57fd26981decf6fcdf37b11 Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Wed, 19 Aug 2020 23:20:32 +0800
Subject: scsi: hisi_sas: Switch v3 hw to MQ

Now that the block layer provides a shared tag, we can switch the driver
to expose all HW queues.

Signed-off-by: John Garry <john.garry@huawei.com>
Tested-by: Douglas Gilbert <dgilbert@interlog.com>
Acked-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/scsi/hisi_sas/hisi_sas.h       |  3 +-
 drivers/scsi/hisi_sas/hisi_sas_main.c  | 36 +++++++-------
 drivers/scsi/hisi_sas/hisi_sas_v3_hw.c | 87 ++++++++++++++--------------------
 3 files changed, 56 insertions(+), 70 deletions(-)

diff --git a/drivers/scsi/hisi_sas/hisi_sas.h b/drivers/scsi/hisi_sas/hisi_sas.h
index 2bdd64648ef0..e6acbf940712 100644
--- a/drivers/scsi/hisi_sas/hisi_sas.h
+++ b/drivers/scsi/hisi_sas/hisi_sas.h
@@ -8,6 +8,8 @@
 #define _HISI_SAS_H_
 
 #include <linux/acpi.h>
+#include <linux/blk-mq.h>
+#include <linux/blk-mq-pci.h>
 #include <linux/clk.h>
 #include <linux/debugfs.h>
 #include <linux/dmapool.h>
@@ -431,7 +433,6 @@ struct hisi_hba {
 	u32 intr_coal_count;	/* Interrupt count to coalesce */
 
 	int cq_nvecs;
-	unsigned int *reply_map;
 
 	/* bist */
 	enum sas_linkrate debugfs_bist_linkrate;
diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c
index d9d21d23372e..a994c7b8d26f 100644
--- a/drivers/scsi/hisi_sas/hisi_sas_main.c
+++ b/drivers/scsi/hisi_sas/hisi_sas_main.c
@@ -417,6 +417,7 @@ static int hisi_sas_task_prep(struct sas_task *task,
 	struct device *dev = hisi_hba->dev;
 	int dlvry_queue_slot, dlvry_queue, rc, slot_idx;
 	int n_elem = 0, n_elem_dif = 0, n_elem_req = 0;
+	struct scsi_cmnd *scmd = NULL;
 	struct hisi_sas_dq *dq;
 	unsigned long flags;
 	int wr_q_index;
@@ -432,10 +433,23 @@ static int hisi_sas_task_prep(struct sas_task *task,
 		return -ECOMM;
 	}
 
-	if (hisi_hba->reply_map) {
-		int cpu = raw_smp_processor_id();
-		unsigned int dq_index = hisi_hba->reply_map[cpu];
+	if (task->uldd_task) {
+		struct ata_queued_cmd *qc;
 
+		if (dev_is_sata(device)) {
+			qc = task->uldd_task;
+			scmd = qc->scsicmd;
+		} else {
+			scmd = task->uldd_task;
+		}
+	}
+
+	if (scmd) {
+		unsigned int dq_index;
+		u32 blk_tag;
+
+		blk_tag = blk_mq_unique_tag(scmd->request);
+		dq_index = blk_mq_unique_tag_to_hwq(blk_tag);
 		*dq_pointer = dq = &hisi_hba->dq[dq_index];
 	} else {
 		*dq_pointer = dq = sas_dev->dq;
@@ -464,21 +478,9 @@ static int hisi_sas_task_prep(struct sas_task *task,
 
 	if (hisi_hba->hw->slot_index_alloc)
 		rc = hisi_hba->hw->slot_index_alloc(hisi_hba, device);
-	else {
-		struct scsi_cmnd *scsi_cmnd = NULL;
-
-		if (task->uldd_task) {
-			struct ata_queued_cmd *qc;
+	else
+		rc = hisi_sas_slot_index_alloc(hisi_hba, scmd);
 
-			if (dev_is_sata(device)) {
-				qc = task->uldd_task;
-				scsi_cmnd = qc->scsicmd;
-			} else {
-				scsi_cmnd = task->uldd_task;
-			}
-		}
-		rc  = hisi_sas_slot_index_alloc(hisi_hba, scsi_cmnd);
-	}
 	if (rc < 0)
 		goto err_out_dif_dma_unmap;
 
diff --git a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c
index 60adf5c32143..b6f75a1764df 100644
--- a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c
+++ b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c
@@ -2362,68 +2362,36 @@ static irqreturn_t cq_interrupt_v3_hw(int irq_no, void *p)
 	return IRQ_WAKE_THREAD;
 }
 
-static void setup_reply_map_v3_hw(struct hisi_hba *hisi_hba, int nvecs)
+static int interrupt_preinit_v3_hw(struct hisi_hba *hisi_hba)
 {
-	const struct cpumask *mask;
-	int queue, cpu;
+	int vectors;
+	int max_msi = HISI_SAS_MSI_COUNT_V3_HW, min_msi;
+	struct Scsi_Host *shost = hisi_hba->shost;
+	struct irq_affinity desc = {
+		.pre_vectors = BASE_VECTORS_V3_HW,
+	};
 
-	for (queue = 0; queue < nvecs; queue++) {
-		struct hisi_sas_cq *cq = &hisi_hba->cq[queue];
+	min_msi = MIN_AFFINE_VECTORS_V3_HW;
+	vectors = pci_alloc_irq_vectors_affinity(hisi_hba->pci_dev,
+						 min_msi, max_msi,
+						 PCI_IRQ_MSI |
+						 PCI_IRQ_AFFINITY,
+						 &desc);
+	if (vectors < 0)
+		return -ENOENT;
 
-		mask = pci_irq_get_affinity(hisi_hba->pci_dev, queue +
-					    BASE_VECTORS_V3_HW);
-		if (!mask)
-			goto fallback;
-		cq->irq_mask = mask;
-		for_each_cpu(cpu, mask)
-			hisi_hba->reply_map[cpu] = queue;
-	}
-	return;
 
-fallback:
-	for_each_possible_cpu(cpu)
-		hisi_hba->reply_map[cpu] = cpu % hisi_hba->queue_count;
-	/* Don't clean all CQ masks */
+	hisi_hba->cq_nvecs = vectors - BASE_VECTORS_V3_HW;
+	shost->nr_hw_queues = hisi_hba->cq_nvecs;
+
+	return 0;
 }
 
 static int interrupt_init_v3_hw(struct hisi_hba *hisi_hba)
 {
 	struct device *dev = hisi_hba->dev;
 	struct pci_dev *pdev = hisi_hba->pci_dev;
-	int vectors, rc, i;
-	int max_msi = HISI_SAS_MSI_COUNT_V3_HW, min_msi;
-
-	if (auto_affine_msi_experimental) {
-		struct irq_affinity desc = {
-			.pre_vectors = BASE_VECTORS_V3_HW,
-		};
-
-		dev_info(dev, "Enable MSI auto-affinity\n");
-
-		min_msi = MIN_AFFINE_VECTORS_V3_HW;
-
-		hisi_hba->reply_map = devm_kcalloc(dev, nr_cpu_ids,
-						   sizeof(unsigned int),
-						   GFP_KERNEL);
-		if (!hisi_hba->reply_map)
-			return -ENOMEM;
-		vectors = pci_alloc_irq_vectors_affinity(hisi_hba->pci_dev,
-							 min_msi, max_msi,
-							 PCI_IRQ_MSI |
-							 PCI_IRQ_AFFINITY,
-							 &desc);
-		if (vectors < 0)
-			return -ENOENT;
-		setup_reply_map_v3_hw(hisi_hba, vectors - BASE_VECTORS_V3_HW);
-	} else {
-		min_msi = max_msi;
-		vectors = pci_alloc_irq_vectors(hisi_hba->pci_dev, min_msi,
-						max_msi, PCI_IRQ_MSI);
-		if (vectors < 0)
-			return vectors;
-	}
-
-	hisi_hba->cq_nvecs = vectors - BASE_VECTORS_V3_HW;
+	int rc, i;
 
 	rc = devm_request_irq(dev, pci_irq_vector(pdev, 1),
 			      int_phy_up_down_bcast_v3_hw, 0,
@@ -3072,6 +3040,15 @@ static int debugfs_set_bist_v3_hw(struct hisi_hba *hisi_hba, bool enable)
 	return 0;
 }
 
+static int hisi_sas_map_queues(struct Scsi_Host *shost)
+{
+	struct hisi_hba *hisi_hba = shost_priv(shost);
+	struct blk_mq_queue_map *qmap = &shost->tag_set.map[HCTX_TYPE_DEFAULT];
+
+	return blk_mq_pci_map_queues(qmap, hisi_hba->pci_dev,
+				     BASE_VECTORS_V3_HW);
+}
+
 static struct scsi_host_template sht_v3_hw = {
 	.name			= DRV_NAME,
 	.proc_name		= DRV_NAME,
@@ -3082,6 +3059,7 @@ static struct scsi_host_template sht_v3_hw = {
 	.slave_configure	= hisi_sas_slave_configure,
 	.scan_finished		= hisi_sas_scan_finished,
 	.scan_start		= hisi_sas_scan_start,
+	.map_queues		= hisi_sas_map_queues,
 	.change_queue_depth	= sas_change_queue_depth,
 	.bios_param		= sas_bios_param,
 	.this_id		= -1,
@@ -3098,6 +3076,7 @@ static struct scsi_host_template sht_v3_hw = {
 	.shost_attrs		= host_attrs_v3_hw,
 	.tag_alloc_policy	= BLK_TAG_ALLOC_RR,
 	.host_reset             = hisi_sas_host_reset,
+	.host_tagset		= 1,
 };
 
 static const struct hisi_sas_hw hisi_sas_v3_hw = {
@@ -3269,6 +3248,10 @@ hisi_sas_v3_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (hisi_sas_debugfs_enable)
 		hisi_sas_debugfs_init(hisi_hba);
 
+	rc = interrupt_preinit_v3_hw(hisi_hba);
+	if (rc)
+		goto err_out_ha;
+	dev_err(dev, "%d hw queues\n", shost->nr_hw_queues);
 	rc = scsi_add_host(shost, dev);
 	if (rc)
 		goto err_out_ha;
-- 
cgit v1.2.3


From f7c4cdc712ab803f7cdd3d47025081831f457ecc Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Wed, 19 Aug 2020 23:20:33 +0800
Subject: scsi: scsi_debug: Support host tagset

When host_max_queue is set (> 0), set the Scsi_Host.host_tagset such that
blk-mq will use a hostwide tagset over all SCSI host submission queues.

This means that we may expose all submission queues and always use the hwq
chosen by blk-mq.

And since if sdebug_host_max_queue is set, sdebug_max_queue is fixed to the
same value, we can simplify how sdebug_driver_template.can_queue is set.

Signed-off-by: John Garry <john.garry@huawei.com>
Tested-by: Douglas Gilbert <dgilbert@interlog.com>
Acked-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/scsi/scsi_debug.c | 28 ++++++++++------------------
 1 file changed, 10 insertions(+), 18 deletions(-)

diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c
index 1ad7260d4758..a87e40aec11f 100644
--- a/drivers/scsi/scsi_debug.c
+++ b/drivers/scsi/scsi_debug.c
@@ -4696,19 +4696,14 @@ fini:
 static struct sdebug_queue *get_queue(struct scsi_cmnd *cmnd)
 {
 	u16 hwq;
+	u32 tag = blk_mq_unique_tag(cmnd->request);
 
-	if (sdebug_host_max_queue) {
-		/* Provide a simple method to choose the hwq */
-		hwq = smp_processor_id() % submit_queues;
-	} else {
-		u32 tag = blk_mq_unique_tag(cmnd->request);
+	hwq = blk_mq_unique_tag_to_hwq(tag);
 
-		hwq = blk_mq_unique_tag_to_hwq(tag);
+	pr_debug("tag=%#x, hwq=%d\n", tag, hwq);
+	if (WARN_ON_ONCE(hwq >= submit_queues))
+		hwq = 0;
 
-		pr_debug("tag=%#x, hwq=%d\n", tag, hwq);
-		if (WARN_ON_ONCE(hwq >= submit_queues))
-			hwq = 0;
-	}
 	return sdebug_q_arr + hwq;
 }
 
@@ -7347,10 +7342,7 @@ static int sdebug_driver_probe(struct device *dev)
 
 	sdbg_host = to_sdebug_host(dev);
 
-	if (sdebug_host_max_queue)
-		sdebug_driver_template.can_queue = sdebug_host_max_queue;
-	else
-		sdebug_driver_template.can_queue = sdebug_max_queue;
+	sdebug_driver_template.can_queue = sdebug_max_queue;
 	if (!sdebug_clustering)
 		sdebug_driver_template.dma_boundary = PAGE_SIZE - 1;
 
@@ -7367,11 +7359,11 @@ static int sdebug_driver_probe(struct device *dev)
 	}
 	/*
 	 * Decide whether to tell scsi subsystem that we want mq. The
-	 * following should give the same answer for each host. If the host
-	 * has a limit of hostwide max commands, then do not set.
+	 * following should give the same answer for each host.
 	 */
-	if (!sdebug_host_max_queue)
-		hpnt->nr_hw_queues = submit_queues;
+	hpnt->nr_hw_queues = submit_queues;
+	if (sdebug_host_max_queue)
+		hpnt->host_tagset = 1;
 
 	sdbg_host->shost = hpnt;
 	*((struct sdebug_host_info **)hpnt->hostdata) = sdbg_host;
-- 
cgit v1.2.3


From 103fbf8e4020845e4fcf63819288cedb092a3c91 Mon Sep 17 00:00:00 2001
From: Kashyap Desai <kashyap.desai@broadcom.com>
Date: Wed, 19 Aug 2020 23:20:35 +0800
Subject: scsi: megaraid_sas: Added support for shared host tagset for
 cpuhotplug

Fusion adapters can steer completions to individual queues, and
we now have support for shared host-wide tags.
So we can enable multiqueue support for fusion adapters.

Once driver enable shared host-wide tags, cpu hotplug feature is also
supported as it was enabled using below patchsets -
commit bf0beec0607d ("blk-mq: drain I/O when all CPUs in a hctx are
offline")

Currently driver has provision to disable host-wide tags using
"host_tagset_enable" module parameter.

Once we do not have any major performance regression using host-wide
tags, we will drop the hand-crafted interrupt affinity settings.

Performance is also meeting the expecatation - (used both none and
mq-deadline scheduler)
24 Drive SSD on Aero with/without this patch can get 3.1M IOPs
3 VDs consist of 8 SAS SSD on Aero with/without this patch can get 3.1M
IOPs.

Signed-off-by: Kashyap Desai <kashyap.desai@broadcom.com>
Signed-off-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: John Garry <john.garry@huawei.com>
Tested-by: Douglas Gilbert <dgilbert@interlog.com>
Acked-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/scsi/megaraid/megaraid_sas_base.c   | 39 +++++++++++++++++++++++++++++
 drivers/scsi/megaraid/megaraid_sas_fusion.c | 29 +++++++++++----------
 2 files changed, 55 insertions(+), 13 deletions(-)

diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c
index 2b7e7b5f38ed..020270ce790b 100644
--- a/drivers/scsi/megaraid/megaraid_sas_base.c
+++ b/drivers/scsi/megaraid/megaraid_sas_base.c
@@ -37,6 +37,7 @@
 #include <linux/poll.h>
 #include <linux/vmalloc.h>
 #include <linux/irq_poll.h>
+#include <linux/blk-mq-pci.h>
 
 #include <scsi/scsi.h>
 #include <scsi/scsi_cmnd.h>
@@ -113,6 +114,10 @@ unsigned int enable_sdev_max_qd;
 module_param(enable_sdev_max_qd, int, 0444);
 MODULE_PARM_DESC(enable_sdev_max_qd, "Enable sdev max qd as can_queue. Default: 0");
 
+int host_tagset_enable = 1;
+module_param(host_tagset_enable, int, 0444);
+MODULE_PARM_DESC(host_tagset_enable, "Shared host tagset enable/disable Default: enable(1)");
+
 MODULE_LICENSE("GPL");
 MODULE_VERSION(MEGASAS_VERSION);
 MODULE_AUTHOR("megaraidlinux.pdl@broadcom.com");
@@ -3119,6 +3124,19 @@ megasas_bios_param(struct scsi_device *sdev, struct block_device *bdev,
 	return 0;
 }
 
+static int megasas_map_queues(struct Scsi_Host *shost)
+{
+	struct megasas_instance *instance;
+
+	instance = (struct megasas_instance *)shost->hostdata;
+
+	if (shost->nr_hw_queues == 1)
+		return 0;
+
+	return blk_mq_pci_map_queues(&shost->tag_set.map[HCTX_TYPE_DEFAULT],
+			instance->pdev, instance->low_latency_index_start);
+}
+
 static void megasas_aen_polling(struct work_struct *work);
 
 /**
@@ -3427,6 +3445,7 @@ static struct scsi_host_template megasas_template = {
 	.eh_timed_out = megasas_reset_timer,
 	.shost_attrs = megaraid_host_attrs,
 	.bios_param = megasas_bios_param,
+	.map_queues = megasas_map_queues,
 	.change_queue_depth = scsi_change_queue_depth,
 	.max_segment_size = 0xffffffff,
 };
@@ -6808,6 +6827,26 @@ static int megasas_io_attach(struct megasas_instance *instance)
 	host->max_lun = MEGASAS_MAX_LUN;
 	host->max_cmd_len = 16;
 
+	/* Use shared host tagset only for fusion adaptors
+	 * if there are managed interrupts (smp affinity enabled case).
+	 * Single msix_vectors in kdump, so shared host tag is also disabled.
+	 */
+
+	host->host_tagset = 0;
+	host->nr_hw_queues = 1;
+
+	if ((instance->adapter_type != MFI_SERIES) &&
+		(instance->msix_vectors > instance->low_latency_index_start) &&
+		host_tagset_enable &&
+		instance->smp_affinity_enable) {
+		host->host_tagset = 1;
+		host->nr_hw_queues = instance->msix_vectors -
+			instance->low_latency_index_start;
+	}
+
+	dev_info(&instance->pdev->dev,
+		"Max firmware commands: %d shared with nr_hw_queues = %d\n",
+		instance->max_fw_cmds, host->nr_hw_queues);
 	/*
 	 * Notify the mid-layer about the new controller
 	 */
diff --git a/drivers/scsi/megaraid/megaraid_sas_fusion.c b/drivers/scsi/megaraid/megaraid_sas_fusion.c
index b0c01cf0428f..fd607287608e 100644
--- a/drivers/scsi/megaraid/megaraid_sas_fusion.c
+++ b/drivers/scsi/megaraid/megaraid_sas_fusion.c
@@ -359,24 +359,29 @@ megasas_get_msix_index(struct megasas_instance *instance,
 {
 	int sdev_busy;
 
-	/* nr_hw_queue = 1 for MegaRAID */
-	struct blk_mq_hw_ctx *hctx =
-		scmd->device->request_queue->queue_hw_ctx[0];
-
-	sdev_busy = atomic_read(&hctx->nr_active);
+	/* TBD - if sml remove device_busy in future, driver
+	 * should track counter in internal structure.
+	 */
+	sdev_busy = atomic_read(&scmd->device->device_busy);
 
 	if (instance->perf_mode == MR_BALANCED_PERF_MODE &&
-	    sdev_busy > (data_arms * MR_DEVICE_HIGH_IOPS_DEPTH))
+	    sdev_busy > (data_arms * MR_DEVICE_HIGH_IOPS_DEPTH)) {
 		cmd->request_desc->SCSIIO.MSIxIndex =
 			mega_mod64((atomic64_add_return(1, &instance->high_iops_outstanding) /
 					MR_HIGH_IOPS_BATCH_COUNT), instance->low_latency_index_start);
-	else if (instance->msix_load_balance)
+	} else if (instance->msix_load_balance) {
 		cmd->request_desc->SCSIIO.MSIxIndex =
 			(mega_mod64(atomic64_add_return(1, &instance->total_io_count),
 				instance->msix_vectors));
-	else
+	} else if (instance->host->nr_hw_queues > 1) {
+		u32 tag = blk_mq_unique_tag(scmd->request);
+
+		cmd->request_desc->SCSIIO.MSIxIndex = blk_mq_unique_tag_to_hwq(tag) +
+			instance->low_latency_index_start;
+	} else {
 		cmd->request_desc->SCSIIO.MSIxIndex =
 			instance->reply_map[raw_smp_processor_id()];
+	}
 }
 
 /**
@@ -956,9 +961,6 @@ megasas_alloc_cmds_fusion(struct megasas_instance *instance)
 	if (megasas_alloc_cmdlist_fusion(instance))
 		goto fail_exit;
 
-	dev_info(&instance->pdev->dev, "Configured max firmware commands: %d\n",
-		 instance->max_fw_cmds);
-
 	/* The first 256 bytes (SMID 0) is not used. Don't add to the cmd list */
 	io_req_base = fusion->io_request_frames + MEGA_MPI2_RAID_DEFAULT_IO_FRAME_SIZE;
 	io_req_base_phys = fusion->io_request_frames_phys + MEGA_MPI2_RAID_DEFAULT_IO_FRAME_SIZE;
@@ -1102,8 +1104,9 @@ megasas_ioc_init_fusion(struct megasas_instance *instance)
 		MR_HIGH_IOPS_QUEUE_COUNT) && cur_intr_coalescing)
 		instance->perf_mode = MR_BALANCED_PERF_MODE;
 
-	dev_info(&instance->pdev->dev, "Performance mode :%s\n",
-		MEGASAS_PERF_MODE_2STR(instance->perf_mode));
+	dev_info(&instance->pdev->dev, "Performance mode :%s (latency index = %d)\n",
+		MEGASAS_PERF_MODE_2STR(instance->perf_mode),
+		instance->low_latency_index_start);
 
 	instance->fw_sync_cache_support = (scratch_pad_1 &
 		MR_CAN_HANDLE_SYNC_CACHE_OFFSET) ? 1 : 0;
-- 
cgit v1.2.3


From 1401fcc4e3da97c44dcc7cbf538c07e24768d791 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Tue, 29 Sep 2020 21:24:17 -0700
Subject: nvme-loop: don't put ctrl on nvme_init_ctrl error

The function nvme_init_ctrl() gets the ctrl reference & when it fails it
does put the ctrl reference in the error unwind code.

When creating loop ctrl in nvme_loop_create_ctrl() if nvme_init_ctrl()
returns non zero (i.e. error) value it jumps to the "out_put_ctrl" label
which calls nvme_put_ctrl(), that will lead to douple ctrl put in error
unwind path.

Update nvme_loop_create_ctrl() such that this patch removes the
"out_put_ctrl" label, add a new "out" label after nvme_put_ctrl() in
error unwind path and jump to newly added label when nvme_init_ctrl()
call retuns an error.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/loop.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 0d6008cf66a2..f6d81239be21 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -579,7 +579,7 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
 	ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_loop_ctrl_ops,
 				0 /* no quirks, we're perfect! */);
 	if (ret)
-		goto out_put_ctrl;
+		goto out;
 
 	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING))
 		WARN_ON_ONCE(1);
@@ -635,8 +635,8 @@ out_free_queues:
 	kfree(ctrl->queues);
 out_uninit_ctrl:
 	nvme_uninit_ctrl(&ctrl->ctrl);
-out_put_ctrl:
 	nvme_put_ctrl(&ctrl->ctrl);
+out:
 	if (ret > 0)
 		ret = -EIO;
 	return ERR_PTR(ret);
-- 
cgit v1.2.3


From 6fcd669514794da08ce6bfa272b6ec9b33cb543d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 20 Aug 2020 09:56:58 +0200
Subject: block: optimize blk_queue_zoned_model for !CONFIG_BLK_DEV_ZONED

Always return BLK_ZONED_NONE if zoned device support is not enabled.
This allows various compiler optimizations including the dead code
elimination that we so like for avoiding ifdefs.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
---
 include/linux/blkdev.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8e77f12de522..1b81b2766858 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -692,7 +692,9 @@ static inline bool queue_is_mq(struct request_queue *q)
 static inline enum blk_zoned_model
 blk_queue_zoned_model(struct request_queue *q)
 {
-	return q->limits.zoned;
+	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))
+		return q->limits.zoned;
+	return BLK_ZONED_NONE;
 }
 
 static inline bool blk_queue_is_zoned(struct request_queue *q)
-- 
cgit v1.2.3


From 7fad20dd7c0ab1d2c224755a574576be25f13e03 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 20 Aug 2020 09:31:36 +0200
Subject: nvme: fix initialization of the zone bitmaps

The removal of the ->revalidate_disk method broke the initialization of
the zone bitmaps, as nvme_revalidate_disk now never gets called during
initialization.

Move the zone related code from nvme_revalidate_disk into a new helper in
zns.c, and call it from nvme_alloc_ns in addition to nvme_validate_ns to
ensure the zone bitmaps are initialized during probe.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
---
 drivers/nvme/host/core.c | 28 +++++-----------------------
 drivers/nvme/host/nvme.h |  1 +
 drivers/nvme/host/zns.c  | 11 +++++++++++
 3 files changed, 17 insertions(+), 23 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 0a985c601c62..400d995f95fe 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2202,28 +2202,6 @@ out:
 	return ret;
 }
 
-static int nvme_revalidate_disk(struct gendisk *disk)
-{
-	int ret;
-
-	ret = _nvme_revalidate_disk(disk);
-	if (ret)
-		return ret;
-
-#ifdef CONFIG_BLK_DEV_ZONED
-	if (blk_queue_is_zoned(disk->queue)) {
-		struct nvme_ns *ns = disk->private_data;
-		struct nvme_ctrl *ctrl = ns->ctrl;
-
-		ret = blk_revalidate_disk_zones(disk, NULL);
-		if (!ret)
-			blk_queue_max_zone_append_sectors(disk->queue,
-							  ctrl->max_zone_append);
-	}
-#endif
-	return ret;
-}
-
 static char nvme_pr_type(enum pr_type type)
 {
 	switch (type) {
@@ -3958,6 +3936,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 
 	if (__nvme_revalidate_disk(disk, id))
 		goto out_put_disk;
+	if (blk_queue_is_zoned(ns->queue) && nvme_revalidate_zones(ns))
+		goto out_put_disk;
 
 	if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
 		ret = nvme_nvm_register(ns, disk_name, node);
@@ -4052,7 +4032,9 @@ static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 		return;
 	}
 
-	ret = nvme_revalidate_disk(ns->disk);
+	ret = _nvme_revalidate_disk(ns->disk);
+	if (!ret && blk_queue_is_zoned(ns->queue))
+		ret = nvme_revalidate_zones(ns);
 	revalidate_disk_size(ns->disk, ret == 0);
 	if (ret)
 		nvme_ns_remove(ns);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 1096ef1f6aa2..6cbbd1597ae6 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -758,6 +758,7 @@ static inline void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
 }
 #endif /* CONFIG_NVME_MULTIPATH */
 
+int nvme_revalidate_zones(struct nvme_ns *ns);
 #ifdef CONFIG_BLK_DEV_ZONED
 int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
 			  unsigned lbaf);
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
index 53efecb67898..56d708017d06 100644
--- a/drivers/nvme/host/zns.c
+++ b/drivers/nvme/host/zns.c
@@ -7,6 +7,17 @@
 #include <linux/vmalloc.h>
 #include "nvme.h"
 
+int nvme_revalidate_zones(struct nvme_ns *ns)
+{
+	struct request_queue *q = ns->queue;
+	int ret;
+
+	ret = blk_revalidate_disk_zones(ns->disk, NULL);
+	if (!ret)
+		blk_queue_max_zone_append_sectors(q, ns->ctrl->max_zone_append);
+	return ret;
+}
+
 static int nvme_set_max_append(struct nvme_ctrl *ctrl)
 {
 	struct nvme_command c = { };
-- 
cgit v1.2.3


From d525c3c023221619748d1e758e5a26daa775f822 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 20 Aug 2020 14:02:18 +0200
Subject: nvme: remove the disk argument to nvme_update_zone_info

The queue can trivially be derived from the nvme_ns structure.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
---
 drivers/nvme/host/core.c | 2 +-
 drivers/nvme/host/nvme.h | 8 ++------
 drivers/nvme/host/zns.c  | 5 ++---
 3 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 400d995f95fe..d4b503208497 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2095,7 +2095,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
 	case NVME_CSI_NVM:
 		break;
 	case NVME_CSI_ZNS:
-		ret = nvme_update_zone_info(disk, ns, lbaf);
+		ret = nvme_update_zone_info(ns, lbaf);
 		if (ret) {
 			dev_warn(ctrl->device,
 				"failed to add zoned namespace:%u ret:%d\n",
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 6cbbd1597ae6..566776100126 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -760,9 +760,7 @@ static inline void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
 
 int nvme_revalidate_zones(struct nvme_ns *ns);
 #ifdef CONFIG_BLK_DEV_ZONED
-int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
-			  unsigned lbaf);
-
+int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf);
 int nvme_report_zones(struct gendisk *disk, sector_t sector,
 		      unsigned int nr_zones, report_zones_cb cb, void *data);
 
@@ -779,9 +777,7 @@ static inline blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns,
 	return BLK_STS_NOTSUPP;
 }
 
-static inline int nvme_update_zone_info(struct gendisk *disk,
-					struct nvme_ns *ns,
-					unsigned lbaf)
+static inline int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
 {
 	dev_warn(ns->ctrl->device,
 		 "Please enable CONFIG_BLK_DEV_ZONED to support ZNS devices\n");
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
index 56d708017d06..67e87e9f306f 100644
--- a/drivers/nvme/host/zns.c
+++ b/drivers/nvme/host/zns.c
@@ -46,11 +46,10 @@ static int nvme_set_max_append(struct nvme_ctrl *ctrl)
 	return 0;
 }
 
-int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
-			  unsigned lbaf)
+int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
 {
 	struct nvme_effects_log *log = ns->head->effects;
-	struct request_queue *q = disk->queue;
+	struct request_queue *q = ns->queue;
 	struct nvme_command c = { };
 	struct nvme_id_ns_zns *id;
 	int status;
-- 
cgit v1.2.3


From eba9bcf7fef0c4a880245b9a261186a879742355 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 28 Sep 2020 09:42:17 +0200
Subject: nvme: rename nvme_validate_ns to nvme_validate_or_alloc_ns

Use a slightly more descriptive name to enable reusing nvme_validate_ns
in the next patch for a lower level function.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
---
 drivers/nvme/host/core.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index d4b503208497..c5f2615bf583 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4021,7 +4021,7 @@ static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
 	}
 }
 
-static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
+static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 {
 	struct nvme_ns *ns;
 	int ret;
@@ -4083,7 +4083,7 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
 
 			if (!nsid)	/* end of the list? */
 				goto out;
-			nvme_validate_ns(ctrl, nsid);
+			nvme_validate_or_alloc_ns(ctrl, nsid);
 			while (++prev < nsid)
 				nvme_ns_remove_by_nsid(ctrl, prev);
 		}
@@ -4106,7 +4106,7 @@ static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl)
 	kfree(id);
 
 	for (i = 1; i <= nn; i++)
-		nvme_validate_ns(ctrl, i);
+		nvme_validate_or_alloc_ns(ctrl, i);
 
 	nvme_remove_invalid_namespaces(ctrl, nn);
 }
-- 
cgit v1.2.3


From 2124f096fb4521d8efdf2412e9102d475ff5cd36 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 28 Sep 2020 10:25:54 +0200
Subject: nvme: rename _nvme_revalidate_disk

Rename _nvme_revalidate_disk to nvme_validate_ns to better describe
what the function does, and pass the struct nvme_ns instead of the
gendisk to better match the call chain.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
---
 drivers/nvme/host/core.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index c5f2615bf583..c04043a94e64 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -89,7 +89,7 @@ static dev_t nvme_chr_devt;
 static struct class *nvme_class;
 static struct class *nvme_subsys_class;
 
-static int _nvme_revalidate_disk(struct gendisk *disk);
+static int nvme_validate_ns(struct nvme_ns *ns);
 static void nvme_put_subsystem(struct nvme_subsystem *subsys);
 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
 					   unsigned nsid);
@@ -1026,7 +1026,7 @@ static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects)
 
 	down_read(&ctrl->namespaces_rwsem);
 	list_for_each_entry(ns, &ctrl->namespaces, list)
-		if (_nvme_revalidate_disk(ns->disk))
+		if (nvme_validate_ns(ns))
 			nvme_set_queue_dying(ns);
 		else if (blk_queue_is_zoned(ns->disk->queue)) {
 			/*
@@ -2154,16 +2154,15 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
 	return 0;
 }
 
-static int _nvme_revalidate_disk(struct gendisk *disk)
+static int nvme_validate_ns(struct nvme_ns *ns)
 {
-	struct nvme_ns *ns = disk->private_data;
 	struct nvme_ctrl *ctrl = ns->ctrl;
 	struct nvme_id_ns *id;
 	struct nvme_ns_ids ids;
 	int ret = 0;
 
 	if (test_bit(NVME_NS_DEAD, &ns->flags)) {
-		set_capacity(disk, 0);
+		set_capacity(ns->disk, 0);
 		return -ENODEV;
 	}
 
@@ -2187,7 +2186,7 @@ static int _nvme_revalidate_disk(struct gendisk *disk)
 		goto free_id;
 	}
 
-	ret = __nvme_revalidate_disk(disk, id);
+	ret = __nvme_revalidate_disk(ns->disk, id);
 free_id:
 	kfree(id);
 out:
@@ -4032,7 +4031,7 @@ static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 		return;
 	}
 
-	ret = _nvme_revalidate_disk(ns->disk);
+	ret = nvme_validate_ns(ns);
 	if (!ret && blk_queue_is_zoned(ns->queue))
 		ret = nvme_revalidate_zones(ns);
 	revalidate_disk_size(ns->disk, ret == 0);
-- 
cgit v1.2.3


From 81382f1730d24a60e2b0499592e64e6e640b1871 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 28 Sep 2020 12:14:20 +0200
Subject: nvme: rename __nvme_revalidate_disk

Rename __nvme_revalidate_disk to nvme_update_ns_info and pass a
namespace instead of the gendisk.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
---
 drivers/nvme/host/core.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index c04043a94e64..fede487f6e04 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2076,10 +2076,9 @@ static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
 	blk_queue_chunk_sectors(ns->queue, iob);
 }
 
-static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
+static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id)
 {
 	unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
-	struct nvme_ns *ns = disk->private_data;
 	struct nvme_ctrl *ctrl = ns->ctrl;
 	int ret;
 
@@ -2141,7 +2140,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
 	}
 
 	nvme_set_chunk_sectors(ns, id);
-	nvme_update_disk_info(disk, ns, id);
+	nvme_update_disk_info(ns->disk, ns, id);
 #ifdef CONFIG_NVME_MULTIPATH
 	if (ns->head->disk) {
 		nvme_update_disk_info(ns->head->disk, ns, id);
@@ -2186,7 +2185,7 @@ static int nvme_validate_ns(struct nvme_ns *ns)
 		goto free_id;
 	}
 
-	ret = __nvme_revalidate_disk(ns->disk, id);
+	ret = nvme_update_ns_info(ns, id);
 free_id:
 	kfree(id);
 out:
@@ -3933,7 +3932,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	memcpy(disk->disk_name, disk_name, DISK_NAME_LEN);
 	ns->disk = disk;
 
-	if (__nvme_revalidate_disk(disk, id))
+	if (nvme_update_ns_info(ns, id))
 		goto out_put_disk;
 	if (blk_queue_is_zoned(ns->queue) && nvme_revalidate_zones(ns))
 		goto out_put_disk;
-- 
cgit v1.2.3


From b8b8cd013327327e757529a0725a8c754c7890e3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 28 Sep 2020 12:33:19 +0200
Subject: nvme: lift the check for an unallocated namespace into
 nvme_identify_ns

Move the check from the two callers into the common helper.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/host/core.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index fede487f6e04..7b1423c7e7fc 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1381,9 +1381,16 @@ static int nvme_identify_ns(struct nvme_ctrl *ctrl,
 	error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id));
 	if (error) {
 		dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
-		kfree(*id);
+		goto out_free_id;
 	}
 
+	error = -ENODEV;
+	if ((*id)->ncap == 0) /* namespace not allocated or attached */
+		goto out_free_id;
+	return 0;
+
+out_free_id:
+	kfree(*id);
 	return error;
 }
 
@@ -2169,11 +2176,6 @@ static int nvme_validate_ns(struct nvme_ns *ns)
 	if (ret)
 		goto out;
 
-	if (id->ncap == 0) {
-		ret = -ENODEV;
-		goto free_id;
-	}
-
 	ret = nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids);
 	if (ret)
 		goto free_id;
@@ -3913,9 +3915,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	if (ret)
 		goto out_free_queue;
 
-	if (id->ncap == 0)	/* no namespace (legacy quirk) */
-		goto out_free_id;
-
 	ret = nvme_init_ns_head(ns, nsid, id);
 	if (ret)
 		goto out_free_id;
-- 
cgit v1.2.3


From fab72f5a046883ae5851c95b7c6e7a08f984e391 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 28 Sep 2020 12:34:04 +0200
Subject: nvme: call nvme_identify_ns as the first thing in nvme_alloc_ns_block

Check if the namespace actually exists as the very first thing and don't
bother with any extra work if not.  This should speed up and simplify
the sequential scanning for NVMe 1.0 devices.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
---
 drivers/nvme/host/core.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 7b1423c7e7fc..4a5c4d45755b 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3887,9 +3887,12 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	char disk_name[DISK_NAME_LEN];
 	int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT, ret;
 
+	if (nvme_identify_ns(ctrl, nsid, &id))
+		return;
+
 	ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
 	if (!ns)
-		return;
+		goto out_free_id;
 
 	ns->queue = blk_mq_init_queue(ctrl->tagset);
 	if (IS_ERR(ns->queue))
@@ -3911,13 +3914,9 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
 	nvme_set_queue_limits(ctrl, ns->queue);
 
-	ret = nvme_identify_ns(ctrl, nsid, &id);
-	if (ret)
-		goto out_free_queue;
-
 	ret = nvme_init_ns_head(ns, nsid, id);
 	if (ret)
-		goto out_free_id;
+		goto out_free_queue;
 	nvme_set_disk_name(disk_name, ns, ctrl, &flags);
 
 	disk = alloc_disk_node(0, node);
@@ -3968,12 +3967,12 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 		list_del_init(&ns->head->entry);
 	mutex_unlock(&ctrl->subsys->lock);
 	nvme_put_ns_head(ns->head);
- out_free_id:
-	kfree(id);
  out_free_queue:
 	blk_cleanup_queue(ns->queue);
  out_free_ns:
 	kfree(ns);
+ out_free_id:
+	kfree(id);
 }
 
 static void nvme_ns_remove(struct nvme_ns *ns)
-- 
cgit v1.2.3


From d4609ea8b3d3fb3423f35805843a82774cb4ef2f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 25 Sep 2020 07:19:13 +0200
Subject: nvme: factor out a nvme_configure_metadata helper

Factor out a helper from nvme_update_ns_info that configures the
per-namespaces metadata and PI settings.  Also make sure the helpers
clear the flags explicitly instead of all of ->features to allow for
potentially reusing ->features for future non-metadata flags.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
---
 drivers/nvme/host/core.c | 78 +++++++++++++++++++++++++++++-------------------
 1 file changed, 47 insertions(+), 31 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 4a5c4d45755b..443aba9c9fdd 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1966,6 +1966,50 @@ static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	return 0;
 }
 
+static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
+{
+	struct nvme_ctrl *ctrl = ns->ctrl;
+
+	/*
+	 * The PI implementation requires the metadata size to be equal to the
+	 * t10 pi tuple size.
+	 */
+	ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
+	if (ns->ms == sizeof(struct t10_pi_tuple))
+		ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
+	else
+		ns->pi_type = 0;
+
+	ns->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
+	if (!ns->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
+		return 0;
+	if (ctrl->ops->flags & NVME_F_FABRICS) {
+		/*
+		 * The NVMe over Fabrics specification only supports metadata as
+		 * part of the extended data LBA.  We rely on HCA/HBA support to
+		 * remap the separate metadata buffer from the block layer.
+		 */
+		if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT)))
+			return -EINVAL;
+		if (ctrl->max_integrity_segments)
+			ns->features |=
+				(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
+	} else {
+		/*
+		 * For PCIe controllers, we can't easily remap the separate
+		 * metadata buffer from the block layer and thus require a
+		 * separate metadata buffer for block layer metadata/PI support.
+		 * We allow extended LBAs for the passthrough interface, though.
+		 */
+		if (id->flbas & NVME_NS_FLBAS_META_EXT)
+			ns->features |= NVME_NS_EXT_LBAS;
+		else
+			ns->features |= NVME_NS_METADATA_SUPPORTED;
+	}
+
+	return 0;
+}
+
 static void nvme_update_disk_info(struct gendisk *disk,
 		struct nvme_ns *ns, struct nvme_id_ns *id)
 {
@@ -2115,37 +2159,9 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id)
 		return -ENODEV;
 	}
 
-	ns->features = 0;
-	ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
-	/* the PI implementation requires metadata equal t10 pi tuple size */
-	if (ns->ms == sizeof(struct t10_pi_tuple))
-		ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
-	else
-		ns->pi_type = 0;
-
-	if (ns->ms) {
-		/*
-		 * For PCIe only the separate metadata pointer is supported,
-		 * as the block layer supplies metadata in a separate bio_vec
-		 * chain. For Fabrics, only metadata as part of extended data
-		 * LBA is supported on the wire per the Fabrics specification,
-		 * but the HBA/HCA will do the remapping from the separate
-		 * metadata buffers for us.
-		 */
-		if (id->flbas & NVME_NS_FLBAS_META_EXT) {
-			ns->features |= NVME_NS_EXT_LBAS;
-			if ((ctrl->ops->flags & NVME_F_FABRICS) &&
-			    (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED) &&
-			    ctrl->max_integrity_segments)
-				ns->features |= NVME_NS_METADATA_SUPPORTED;
-		} else {
-			if (WARN_ON_ONCE(ctrl->ops->flags & NVME_F_FABRICS))
-				return -EINVAL;
-			if (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)
-				ns->features |= NVME_NS_METADATA_SUPPORTED;
-		}
-	}
-
+	ret = nvme_configure_metadata(ns, id);
+	if (ret)
+		return ret;
 	nvme_set_chunk_sectors(ns, id);
 	nvme_update_disk_info(ns->disk, ns, id);
 #ifdef CONFIG_NVME_MULTIPATH
-- 
cgit v1.2.3


From f9d5f4579feafa721dba2f350fc064a1852c6f8c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 28 Sep 2020 12:11:42 +0200
Subject: nvme: freeze the queue over ->lba_shift updates

Ensure that there can't be any I/O in flight went we change the disk
geometry in nvme_update_ns_info, most notable the LBA size by lifting
the queue free from nvme_update_disk_info into the caller

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
---
 drivers/nvme/host/core.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 443aba9c9fdd..82cd03c0ba21 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2021,7 +2021,7 @@ static void nvme_update_disk_info(struct gendisk *disk,
 		/* unsupported block size, set capacity to 0 later */
 		bs = (1 << 9);
 	}
-	blk_mq_freeze_queue(disk->queue);
+
 	blk_integrity_unregister(disk);
 
 	atomic_bs = phys_bs = bs;
@@ -2086,8 +2086,6 @@ static void nvme_update_disk_info(struct gendisk *disk,
 		set_disk_ro(disk, true);
 	else
 		set_disk_ro(disk, false);
-
-	blk_mq_unfreeze_queue(disk->queue);
 }
 
 static inline bool nvme_first_scan(struct gendisk *disk)
@@ -2133,6 +2131,7 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id)
 	struct nvme_ctrl *ctrl = ns->ctrl;
 	int ret;
 
+	blk_mq_freeze_queue(ns->disk->queue);
 	/*
 	 * If identify namespace failed, use default 512 byte block size so
 	 * block layer can use before failing read/write for 0 capacity.
@@ -2150,30 +2149,39 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id)
 			dev_warn(ctrl->device,
 				"failed to add zoned namespace:%u ret:%d\n",
 				ns->head->ns_id, ret);
-			return ret;
+			goto out_unfreeze;
 		}
 		break;
 	default:
 		dev_warn(ctrl->device, "unknown csi:%u ns:%u\n",
 			ns->head->ids.csi, ns->head->ns_id);
-		return -ENODEV;
+		ret = -ENODEV;
+		goto out_unfreeze;
 	}
 
 	ret = nvme_configure_metadata(ns, id);
 	if (ret)
-		return ret;
+		goto out_unfreeze;
 	nvme_set_chunk_sectors(ns, id);
 	nvme_update_disk_info(ns->disk, ns, id);
+	blk_mq_unfreeze_queue(ns->disk->queue);
+
 #ifdef CONFIG_NVME_MULTIPATH
 	if (ns->head->disk) {
+		blk_mq_freeze_queue(ns->head->disk->queue);
 		nvme_update_disk_info(ns->head->disk, ns, id);
 		blk_stack_limits(&ns->head->disk->queue->limits,
 				 &ns->queue->limits, 0);
 		blk_queue_update_readahead(ns->head->disk->queue);
 		nvme_update_bdev_size(ns->head->disk);
+		blk_mq_unfreeze_queue(ns->head->disk->queue);
 	}
 #endif
 	return 0;
+
+out_unfreeze:
+	blk_mq_unfreeze_queue(ns->disk->queue);
+	return ret;
 }
 
 static int nvme_validate_ns(struct nvme_ns *ns)
-- 
cgit v1.2.3


From 13f0b26bbf0a8660d23cad22ed0c4b58b5675dc4 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 28 Sep 2020 12:03:13 +0200
Subject: nvme: clean up the check for too large logic block sizes

Use a single statement to set both the capacity and fake block size
instead of two.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/core.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 82cd03c0ba21..0114fe47de35 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2017,8 +2017,12 @@ static void nvme_update_disk_info(struct gendisk *disk,
 	unsigned short bs = 1 << ns->lba_shift;
 	u32 atomic_bs, phys_bs, io_opt = 0;
 
+	/*
+	 * The block layer can't support LBA sizes larger than the page size
+	 * yet, so catch this early and don't allow block I/O.
+	 */
 	if (ns->lba_shift > PAGE_SHIFT) {
-		/* unsupported block size, set capacity to 0 later */
+		capacity = 0;
 		bs = (1 << 9);
 	}
 
@@ -2055,13 +2059,6 @@ static void nvme_update_disk_info(struct gendisk *disk,
 	blk_queue_io_min(disk->queue, phys_bs);
 	blk_queue_io_opt(disk->queue, io_opt);
 
-	/*
-	 * The block layer can't support LBA sizes larger than the page size
-	 * yet, so catch this early and don't allow block I/O.
-	 */
-	if (ns->lba_shift > PAGE_SHIFT)
-		capacity = 0;
-
 	/*
 	 * Register a metadata profile for PI, or the plain non-integrity NVMe
 	 * metadata masquerading as Type 0 if supported, otherwise reject block
-- 
cgit v1.2.3


From 310b30e575b1e2b9a569c3582062b79c5a562fb7 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 28 Sep 2020 12:12:02 +0200
Subject: nvme: remove the 0 lba_shift check in nvme_update_ns_info

We can no longer reach this code if Identify Namespace failed.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
---
 drivers/nvme/host/core.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 0114fe47de35..910198c3e0bb 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2129,13 +2129,7 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id)
 	int ret;
 
 	blk_mq_freeze_queue(ns->disk->queue);
-	/*
-	 * If identify namespace failed, use default 512 byte block size so
-	 * block layer can use before failing read/write for 0 capacity.
-	 */
 	ns->lba_shift = id->lbaf[lbaf].ds;
-	if (ns->lba_shift == 0)
-		ns->lba_shift = 9;
 
 	switch (ns->head->ids.csi) {
 	case NVME_CSI_NVM:
-- 
cgit v1.2.3


From 658d9f7c2c7044f9978623e7f429b85bbb7553a3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 28 Sep 2020 12:05:28 +0200
Subject: nvme: set the queue limits in nvme_update_ns_info

Only set the queue limits once we have the real block size.  This also
updates the limits on a rescan if needed.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
---
 drivers/nvme/host/core.c | 46 +++++++++++++++++++++-------------------------
 1 file changed, 21 insertions(+), 25 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 910198c3e0bb..bb630d5fcb96 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2010,6 +2010,26 @@ static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
 	return 0;
 }
 
+static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
+		struct request_queue *q)
+{
+	bool vwc = false;
+
+	if (ctrl->max_hw_sectors) {
+		u32 max_segments =
+			(ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1;
+
+		max_segments = min_not_zero(max_segments, ctrl->max_segments);
+		blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
+		blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
+	}
+	blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1);
+	blk_queue_dma_alignment(q, 7);
+	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
+		vwc = true;
+	blk_queue_write_cache(q, vwc, vwc);
+}
+
 static void nvme_update_disk_info(struct gendisk *disk,
 		struct nvme_ns *ns, struct nvme_id_ns *id)
 {
@@ -2130,6 +2150,7 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id)
 
 	blk_mq_freeze_queue(ns->disk->queue);
 	ns->lba_shift = id->lbaf[lbaf].ds;
+	nvme_set_queue_limits(ctrl, ns->queue);
 
 	switch (ns->head->ids.csi) {
 	case NVME_CSI_NVM:
@@ -2495,26 +2516,6 @@ int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
 }
 EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl);
 
-static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
-		struct request_queue *q)
-{
-	bool vwc = false;
-
-	if (ctrl->max_hw_sectors) {
-		u32 max_segments =
-			(ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1;
-
-		max_segments = min_not_zero(max_segments, ctrl->max_segments);
-		blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
-		blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
-	}
-	blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1);
-	blk_queue_dma_alignment(q, 7);
-	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
-		vwc = true;
-	blk_queue_write_cache(q, vwc, vwc);
-}
-
 static int nvme_configure_timestamp(struct nvme_ctrl *ctrl)
 {
 	__le64 ts;
@@ -3922,12 +3923,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 
 	ns->queue->queuedata = ns;
 	ns->ctrl = ctrl;
-
 	kref_init(&ns->kref);
-	ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
-
-	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
-	nvme_set_queue_limits(ctrl, ns->queue);
 
 	ret = nvme_init_ns_head(ns, nsid, id);
 	if (ret)
-- 
cgit v1.2.3


From 75eb779ee0d343d1eb6d8c19205bc76a1d3ba2ed Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 25 Sep 2020 08:34:43 +0200
Subject: nvme: update the known admin effects

A Format NVM command can change the capabilities of namespaces, while
Sanitize does change the Logical Block Content and must be serialized.

Also remove CSUPP bit for Format - it is not a mandatory command,
and we don't check for the bit anyway.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
---
 drivers/nvme/host/core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index bb630d5fcb96..4737591c1143 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -968,10 +968,10 @@ static u32 nvme_known_admin_effects(u8 opcode)
 {
 	switch (opcode) {
 	case nvme_admin_format_nvm:
-		return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC |
+		return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_NCC |
 			NVME_CMD_EFFECTS_CSE_MASK;
 	case nvme_admin_sanitize_nvm:
-		return NVME_CMD_EFFECTS_CSE_MASK;
+		return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK;
 	default:
 		break;
 	}
-- 
cgit v1.2.3


From af0f446d2cad06bd678e9b912f7653b63d87fd6b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 28 Sep 2020 11:10:36 +0200
Subject: nvme: remove nvme_update_formats

Now that the queue is frozen before updating ->lba_shift we can't hit the
invalid references mentioned in the comment any more.  More importantly
this code would not have helped us if the format was changed by another
controller or through implementation defined back channels.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
---
 drivers/nvme/host/core.c | 32 ++------------------------------
 1 file changed, 2 insertions(+), 30 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 4737591c1143..f19f6c7c5b12 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -89,7 +89,6 @@ static dev_t nvme_chr_devt;
 static struct class *nvme_class;
 static struct class *nvme_subsys_class;
 
-static int nvme_validate_ns(struct nvme_ns *ns);
 static void nvme_put_subsystem(struct nvme_subsystem *subsys);
 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
 					   unsigned nsid);
@@ -1009,7 +1008,7 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	 * For simplicity, IO to all namespaces is quiesced even if the command
 	 * effects say only one namespace is affected.
 	 */
-	if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
+	if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
 		mutex_lock(&ctrl->scan_lock);
 		mutex_lock(&ctrl->subsys->lock);
 		nvme_mpath_start_freeze(ctrl->subsys);
@@ -1020,36 +1019,9 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	return effects;
 }
 
-static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects)
-{
-	struct nvme_ns *ns;
-
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
-		if (nvme_validate_ns(ns))
-			nvme_set_queue_dying(ns);
-		else if (blk_queue_is_zoned(ns->disk->queue)) {
-			/*
-			 * IO commands are required to fully revalidate a zoned
-			 * device. Force the command effects to trigger rescan
-			 * work so report zones can run in a context with
-			 * unfrozen IO queues.
-			 */
-			*effects |= NVME_CMD_EFFECTS_NCC;
-		}
-	up_read(&ctrl->namespaces_rwsem);
-}
-
 static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
 {
-	/*
-	 * Revalidate LBA changes prior to unfreezing. This is necessary to
-	 * prevent memory corruption if a logical block size was changed by
-	 * this command.
-	 */
-	if (effects & NVME_CMD_EFFECTS_LBCC)
-		nvme_update_formats(ctrl, &effects);
-	if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
+	if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
 		nvme_unfreeze(ctrl);
 		nvme_mpath_unfreeze(ctrl->subsys);
 		mutex_unlock(&ctrl->subsys->lock);
-- 
cgit v1.2.3


From 3a9967ba7ace91153f9caa8e60a55c7668c7b946 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 28 Sep 2020 12:30:16 +0200
Subject: nvme: revalidate zone bitmaps in nvme_update_ns_info

Consolidate the two calls into a single place.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/core.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index f19f6c7c5b12..9c137d8819f7 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2150,6 +2150,12 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id)
 	nvme_update_disk_info(ns->disk, ns, id);
 	blk_mq_unfreeze_queue(ns->disk->queue);
 
+	if (blk_queue_is_zoned(ns->queue)) {
+		ret = nvme_revalidate_zones(ns);
+		if (ret)
+			return ret;
+	}
+
 #ifdef CONFIG_NVME_MULTIPATH
 	if (ns->head->disk) {
 		blk_mq_freeze_queue(ns->head->disk->queue);
@@ -3915,8 +3921,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 
 	if (nvme_update_ns_info(ns, id))
 		goto out_put_disk;
-	if (blk_queue_is_zoned(ns->queue) && nvme_revalidate_zones(ns))
-		goto out_put_disk;
 
 	if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
 		ret = nvme_nvm_register(ns, disk_name, node);
@@ -4012,8 +4016,6 @@ static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	}
 
 	ret = nvme_validate_ns(ns);
-	if (!ret && blk_queue_is_zoned(ns->queue))
-		ret = nvme_revalidate_zones(ns);
 	revalidate_disk_size(ns->disk, ret == 0);
 	if (ret)
 		nvme_ns_remove(ns);
-- 
cgit v1.2.3


From 8b7c0ff2d46dad4974e84f2363d7e0ddefaf0677 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 28 Sep 2020 14:07:56 +0200
Subject: nvme: query namespace identifiers before adding the namespace

Check the namespace identifier list first thing when scanning namespaces.
This keeps the code to query the CSI common between the alloc and validate
path, and helps to structure the code better for multiple command set
support.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
---
 drivers/nvme/host/core.c | 116 ++++++++++++++++++++++-------------------------
 1 file changed, 53 insertions(+), 63 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 9c137d8819f7..ad18c32b36e7 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1281,6 +1281,8 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
 	int status, pos, len;
 	void *data;
 
+	if (ctrl->vs < NVME_VS(1, 3, 0) && !nvme_multi_css(ctrl))
+		return 0;
 	if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST)
 		return 0;
 
@@ -1335,8 +1337,8 @@ static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *n
 				    NVME_IDENTIFY_DATA_SIZE);
 }
 
-static int nvme_identify_ns(struct nvme_ctrl *ctrl,
-		unsigned nsid, struct nvme_id_ns **id)
+static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
+			struct nvme_ns_ids *ids, struct nvme_id_ns **id)
 {
 	struct nvme_command c = { };
 	int error;
@@ -1359,6 +1361,14 @@ static int nvme_identify_ns(struct nvme_ctrl *ctrl,
 	error = -ENODEV;
 	if ((*id)->ncap == 0) /* namespace not allocated or attached */
 		goto out_free_id;
+
+	if (ctrl->vs >= NVME_VS(1, 1, 0) &&
+	    !memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
+		memcpy(ids->eui64, (*id)->eui64, sizeof(ids->eui64));
+	if (ctrl->vs >= NVME_VS(1, 2, 0) &&
+	    !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
+		memcpy(ids->nguid, (*id)->nguid, sizeof(ids->nguid));
+
 	return 0;
 
 out_free_id:
@@ -1884,20 +1894,6 @@ static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns)
 					   nvme_lba_to_sect(ns, max_blocks));
 }
 
-static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
-		struct nvme_id_ns *id, struct nvme_ns_ids *ids)
-{
-	memset(ids, 0, sizeof(*ids));
-
-	if (ctrl->vs >= NVME_VS(1, 1, 0))
-		memcpy(ids->eui64, id->eui64, sizeof(id->eui64));
-	if (ctrl->vs >= NVME_VS(1, 2, 0))
-		memcpy(ids->nguid, id->nguid, sizeof(id->nguid));
-	if (ctrl->vs >= NVME_VS(1, 3, 0) || nvme_multi_css(ctrl))
-		return nvme_identify_ns_descs(ctrl, nsid, ids);
-	return 0;
-}
-
 static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids)
 {
 	return !uuid_is_null(&ids->uuid) ||
@@ -2117,30 +2113,16 @@ static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
 static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id)
 {
 	unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
-	struct nvme_ctrl *ctrl = ns->ctrl;
 	int ret;
 
 	blk_mq_freeze_queue(ns->disk->queue);
 	ns->lba_shift = id->lbaf[lbaf].ds;
-	nvme_set_queue_limits(ctrl, ns->queue);
+	nvme_set_queue_limits(ns->ctrl, ns->queue);
 
-	switch (ns->head->ids.csi) {
-	case NVME_CSI_NVM:
-		break;
-	case NVME_CSI_ZNS:
+	if (ns->head->ids.csi == NVME_CSI_ZNS) {
 		ret = nvme_update_zone_info(ns, lbaf);
-		if (ret) {
-			dev_warn(ctrl->device,
-				"failed to add zoned namespace:%u ret:%d\n",
-				ns->head->ns_id, ret);
+		if (ret)
 			goto out_unfreeze;
-		}
-		break;
-	default:
-		dev_warn(ctrl->device, "unknown csi:%u ns:%u\n",
-			ns->head->ids.csi, ns->head->ns_id);
-		ret = -ENODEV;
-		goto out_unfreeze;
 	}
 
 	ret = nvme_configure_metadata(ns, id);
@@ -2174,11 +2156,10 @@ out_unfreeze:
 	return ret;
 }
 
-static int nvme_validate_ns(struct nvme_ns *ns)
+static int nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_ids *ids)
 {
 	struct nvme_ctrl *ctrl = ns->ctrl;
 	struct nvme_id_ns *id;
-	struct nvme_ns_ids ids;
 	int ret = 0;
 
 	if (test_bit(NVME_NS_DEAD, &ns->flags)) {
@@ -2186,15 +2167,11 @@ static int nvme_validate_ns(struct nvme_ns *ns)
 		return -ENODEV;
 	}
 
-	ret = nvme_identify_ns(ctrl, ns->head->ns_id, &id);
+	ret = nvme_identify_ns(ctrl, ns->head->ns_id, ids, &id);
 	if (ret)
 		goto out;
 
-	ret = nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids);
-	if (ret)
-		goto free_id;
-
-	if (!nvme_ns_ids_equal(&ns->head->ids, &ids)) {
+	if (!nvme_ns_ids_equal(&ns->head->ids, ids)) {
 		dev_err(ctrl->device,
 			"identifiers changed for nsid %d\n", ns->head->ns_id);
 		ret = -ENODEV;
@@ -3794,25 +3771,16 @@ out:
 }
 
 static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
-		struct nvme_id_ns *id)
+		struct nvme_ns_ids *ids, bool is_shared)
 {
 	struct nvme_ctrl *ctrl = ns->ctrl;
-	bool is_shared = id->nmic & NVME_NS_NMIC_SHARED;
 	struct nvme_ns_head *head = NULL;
-	struct nvme_ns_ids ids;
 	int ret = 0;
 
-	ret = nvme_report_ns_ids(ctrl, nsid, id, &ids);
-	if (ret) {
-		if (ret < 0)
-			return ret;
-		return blk_status_to_errno(nvme_error_status(ret));
-	}
-
 	mutex_lock(&ctrl->subsys->lock);
 	head = nvme_find_ns_head(ctrl->subsys, nsid);
 	if (!head) {
-		head = nvme_alloc_ns_head(ctrl, nsid, &ids);
+		head = nvme_alloc_ns_head(ctrl, nsid, ids);
 		if (IS_ERR(head)) {
 			ret = PTR_ERR(head);
 			goto out_unlock;
@@ -3825,7 +3793,7 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
 				"Duplicate unshared namespace %d\n", nsid);
 			goto out_put_ns_head;
 		}
-		if (!nvme_ns_ids_equal(&head->ids, &ids)) {
+		if (!nvme_ns_ids_equal(&head->ids, ids)) {
 			dev_err(ctrl->device,
 				"IDs don't match for shared namespace %d\n",
 					nsid);
@@ -3873,7 +3841,8 @@ struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 }
 EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
 
-static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
+static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
+		struct nvme_ns_ids *ids)
 {
 	struct nvme_ns *ns;
 	struct gendisk *disk;
@@ -3881,7 +3850,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	char disk_name[DISK_NAME_LEN];
 	int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT, ret;
 
-	if (nvme_identify_ns(ctrl, nsid, &id))
+	if (nvme_identify_ns(ctrl, nsid, ids, &id))
 		return;
 
 	ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
@@ -3903,7 +3872,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	ns->ctrl = ctrl;
 	kref_init(&ns->kref);
 
-	ret = nvme_init_ns_head(ns, nsid, id);
+	ret = nvme_init_ns_head(ns, nsid, ids, id->nmic & NVME_NS_NMIC_SHARED);
 	if (ret)
 		goto out_free_queue;
 	nvme_set_disk_name(disk_name, ns, ctrl, &flags);
@@ -4006,20 +3975,41 @@ static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
 
 static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 {
+	struct nvme_ns_ids ids = { };
 	struct nvme_ns *ns;
 	int ret;
 
+	if (nvme_identify_ns_descs(ctrl, nsid, &ids))
+		return;
+
 	ns = nvme_find_get_ns(ctrl, nsid);
-	if (!ns) {
-		nvme_alloc_ns(ctrl, nsid);
+	if (ns) {
+		ret = nvme_validate_ns(ns, &ids);
+		revalidate_disk_size(ns->disk, ret == 0);
+		if (ret)
+			nvme_ns_remove(ns);
+		nvme_put_ns(ns);
 		return;
 	}
 
-	ret = nvme_validate_ns(ns);
-	revalidate_disk_size(ns->disk, ret == 0);
-	if (ret)
-		nvme_ns_remove(ns);
-	nvme_put_ns(ns);
+	switch (ids.csi) {
+	case NVME_CSI_NVM:
+		nvme_alloc_ns(ctrl, nsid, &ids);
+		break;
+	case NVME_CSI_ZNS:
+		if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
+			dev_warn(ctrl->device,
+				"nsid %u not supported without CONFIG_BLK_DEV_ZONED\n",
+				nsid);
+			break;
+		}
+		nvme_alloc_ns(ctrl, nsid, &ids);
+		break;
+	default:
+		dev_warn(ctrl->device, "unknown csi %u for nsid %u\n",
+			ids.csi, nsid);
+		break;
+	}
 }
 
 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
-- 
cgit v1.2.3


From b2dc748a70c65a1b4eb1b9fceab57662cfd83e41 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 28 Sep 2020 13:55:22 +0200
Subject: nvme: move nvme_validate_ns

Move nvme_validate_ns just above its only remaining caller.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
---
 drivers/nvme/host/core.c | 74 ++++++++++++++++++++++++------------------------
 1 file changed, 37 insertions(+), 37 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index ad18c32b36e7..07309f6c14fa 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2156,43 +2156,6 @@ out_unfreeze:
 	return ret;
 }
 
-static int nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_ids *ids)
-{
-	struct nvme_ctrl *ctrl = ns->ctrl;
-	struct nvme_id_ns *id;
-	int ret = 0;
-
-	if (test_bit(NVME_NS_DEAD, &ns->flags)) {
-		set_capacity(ns->disk, 0);
-		return -ENODEV;
-	}
-
-	ret = nvme_identify_ns(ctrl, ns->head->ns_id, ids, &id);
-	if (ret)
-		goto out;
-
-	if (!nvme_ns_ids_equal(&ns->head->ids, ids)) {
-		dev_err(ctrl->device,
-			"identifiers changed for nsid %d\n", ns->head->ns_id);
-		ret = -ENODEV;
-		goto free_id;
-	}
-
-	ret = nvme_update_ns_info(ns, id);
-free_id:
-	kfree(id);
-out:
-	/*
-	 * Only fail the function if we got a fatal error back from the
-	 * device, otherwise ignore the error and just move on.
-	 */
-	if (ret == -ENOMEM || (ret > 0 && !(ret & NVME_SC_DNR)))
-		ret = 0;
-	else if (ret > 0)
-		ret = blk_status_to_errno(nvme_error_status(ret));
-	return ret;
-}
-
 static char nvme_pr_type(enum pr_type type)
 {
 	switch (type) {
@@ -3973,6 +3936,43 @@ static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
 	}
 }
 
+static int nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_ids *ids)
+{
+	struct nvme_ctrl *ctrl = ns->ctrl;
+	struct nvme_id_ns *id;
+	int ret = 0;
+
+	if (test_bit(NVME_NS_DEAD, &ns->flags)) {
+		set_capacity(ns->disk, 0);
+		return -ENODEV;
+	}
+
+	ret = nvme_identify_ns(ctrl, ns->head->ns_id, ids, &id);
+	if (ret)
+		goto out;
+
+	if (!nvme_ns_ids_equal(&ns->head->ids, ids)) {
+		dev_err(ctrl->device,
+			"identifiers changed for nsid %d\n", ns->head->ns_id);
+		ret = -ENODEV;
+		goto free_id;
+	}
+
+	ret = nvme_update_ns_info(ns, id);
+free_id:
+	kfree(id);
+out:
+	/*
+	 * Only fail the function if we got a fatal error back from the
+	 * device, otherwise ignore the error and just move on.
+	 */
+	if (ret == -ENOMEM || (ret > 0 && !(ret & NVME_SC_DNR)))
+		ret = 0;
+	else if (ret > 0)
+		ret = blk_status_to_errno(nvme_error_status(ret));
+	return ret;
+}
+
 static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 {
 	struct nvme_ns_ids ids = { };
-- 
cgit v1.2.3


From 0a05226a3a2038b28d78101239196222d375124a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 28 Sep 2020 13:59:06 +0200
Subject: nvme: refactor nvme_validate_ns

Move the logic to revalidate the block_device size or remove the
namespace from the caller into nvme_validate_ns.  This removes
the return value and thus the status code translation.  Additionally
it also catches non-permanent errors from nvme_update_ns_info using
the existing logic.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
---
 drivers/nvme/host/core.c | 37 +++++++++++++++++--------------------
 1 file changed, 17 insertions(+), 20 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 07309f6c14fa..0b88a377a47f 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3899,6 +3899,7 @@ static void nvme_ns_remove(struct nvme_ns *ns)
 	if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
 		return;
 
+	set_capacity(ns->disk, 0);
 	nvme_fault_inject_fini(&ns->fault_inject);
 
 	mutex_lock(&ns->ctrl->subsys->lock);
@@ -3936,58 +3937,54 @@ static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
 	}
 }
 
-static int nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_ids *ids)
+static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_ids *ids)
 {
 	struct nvme_ctrl *ctrl = ns->ctrl;
 	struct nvme_id_ns *id;
-	int ret = 0;
+	int ret = -ENODEV;
 
-	if (test_bit(NVME_NS_DEAD, &ns->flags)) {
-		set_capacity(ns->disk, 0);
-		return -ENODEV;
-	}
+	if (test_bit(NVME_NS_DEAD, &ns->flags))
+		goto out;
 
 	ret = nvme_identify_ns(ctrl, ns->head->ns_id, ids, &id);
 	if (ret)
 		goto out;
 
+	ret = -ENODEV;
 	if (!nvme_ns_ids_equal(&ns->head->ids, ids)) {
 		dev_err(ctrl->device,
 			"identifiers changed for nsid %d\n", ns->head->ns_id);
-		ret = -ENODEV;
-		goto free_id;
+		goto out_free_id;
 	}
 
 	ret = nvme_update_ns_info(ns, id);
-free_id:
+
+out_free_id:
 	kfree(id);
 out:
 	/*
-	 * Only fail the function if we got a fatal error back from the
+	 * Only remove the namespace if we got a fatal error back from the
 	 * device, otherwise ignore the error and just move on.
+	 *
+	 * TODO: we should probably schedule a delayed retry here.
 	 */
-	if (ret == -ENOMEM || (ret > 0 && !(ret & NVME_SC_DNR)))
-		ret = 0;
-	else if (ret > 0)
-		ret = blk_status_to_errno(nvme_error_status(ret));
-	return ret;
+	if (ret && ret != -ENOMEM && !(ret > 0 && !(ret & NVME_SC_DNR)))
+		nvme_ns_remove(ns);
+	else
+		revalidate_disk_size(ns->disk, true);
 }
 
 static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 {
 	struct nvme_ns_ids ids = { };
 	struct nvme_ns *ns;
-	int ret;
 
 	if (nvme_identify_ns_descs(ctrl, nsid, &ids))
 		return;
 
 	ns = nvme_find_get_ns(ctrl, nsid);
 	if (ns) {
-		ret = nvme_validate_ns(ns, &ids);
-		revalidate_disk_size(ns->disk, ret == 0);
-		if (ret)
-			nvme_ns_remove(ns);
+		nvme_validate_ns(ns, &ids);
 		nvme_put_ns(ns);
 		return;
 	}
-- 
cgit v1.2.3


From 7b15336257ed5623cddefa8d33069777494c98e0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 28 Sep 2020 14:08:28 +0200
Subject: nvme: remove nvme_identify_ns_list

Just fold it into the only caller.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
---
 drivers/nvme/host/core.c | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 0b88a377a47f..385b10317873 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1326,17 +1326,6 @@ free_data:
 	return status;
 }
 
-static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list)
-{
-	struct nvme_command c = { };
-
-	c.identify.opcode = nvme_admin_identify;
-	c.identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST;
-	c.identify.nsid = cpu_to_le32(nsid);
-	return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list,
-				    NVME_IDENTIFY_DATA_SIZE);
-}
-
 static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
 			struct nvme_ns_ids *ids, struct nvme_id_ns **id)
 {
@@ -4042,7 +4031,14 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
 		return -ENOMEM;
 
 	for (;;) {
-		ret = nvme_identify_ns_list(ctrl, prev, ns_list);
+		struct nvme_command cmd = {
+			.identify.opcode	= nvme_admin_identify,
+			.identify.cns		= NVME_ID_CNS_NS_ACTIVE_LIST,
+			.identify.nsid		= cpu_to_le32(prev),
+		};
+
+		ret = nvme_submit_sync_cmd(ctrl->admin_q, &cmd, ns_list,
+					    NVME_IDENTIFY_DATA_SIZE);
 		if (ret)
 			goto free;
 
-- 
cgit v1.2.3


From af5d6f7ba5f99f8316473557240ae9acdd20a6bd Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Thu, 1 Oct 2020 11:54:31 -0700
Subject: nvme-core: remove extra variable

In nvme_validate_ns() the exra variable ctrl is used only twice.
Using ns->ctrl directly still maintains the redability and original
length of the lines in the code. Get rid of the extra variable ctrl &
use ns->ctrl directly.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 385b10317873..2e505cdf051e 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3928,20 +3928,19 @@ static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
 
 static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_ids *ids)
 {
-	struct nvme_ctrl *ctrl = ns->ctrl;
 	struct nvme_id_ns *id;
 	int ret = -ENODEV;
 
 	if (test_bit(NVME_NS_DEAD, &ns->flags))
 		goto out;
 
-	ret = nvme_identify_ns(ctrl, ns->head->ns_id, ids, &id);
+	ret = nvme_identify_ns(ns->ctrl, ns->head->ns_id, ids, &id);
 	if (ret)
 		goto out;
 
 	ret = -ENODEV;
 	if (!nvme_ns_ids_equal(&ns->head->ids, ids)) {
-		dev_err(ctrl->device,
+		dev_err(ns->ctrl->device,
 			"identifiers changed for nsid %d\n", ns->head->ns_id);
 		goto out_free_id;
 	}
-- 
cgit v1.2.3


From c4485252cf36ae62c8bf12c4aede72345cad0d2b Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Thu, 1 Oct 2020 11:54:32 -0700
Subject: nvme-core: remove extra condition for vwc

In nvme_set_queue_limits() we initialize vwc to false and later add
a condition to set vwc true. The value of the vwc can be declare
initialized which makes all the blk_queue_XXX() calls uniform.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 2e505cdf051e..e85f6304efd7 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1970,7 +1970,7 @@ static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
 static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
 		struct request_queue *q)
 {
-	bool vwc = false;
+	bool vwc = ctrl->vwc & NVME_CTRL_VWC_PRESENT;
 
 	if (ctrl->max_hw_sectors) {
 		u32 max_segments =
@@ -1982,8 +1982,6 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
 	}
 	blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1);
 	blk_queue_dma_alignment(q, 7);
-	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
-		vwc = true;
 	blk_queue_write_cache(q, vwc, vwc);
 }
 
-- 
cgit v1.2.3


From d7a1c483f797fb92c091e66b6a6e941f766b84e0 Mon Sep 17 00:00:00 2001
From: Jason Yan <yanaijie@huawei.com>
Date: Tue, 29 Sep 2020 14:47:59 +0800
Subject: md/raid0: remove unused function is_io_in_chunk_boundary()

This function is no longger needed after commit 20d0189b1012 ("block:
Introduce new bio_split()").

Signed-off-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid0.c | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index e2de0cced707..6f44177593a5 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -426,23 +426,6 @@ static void raid0_free(struct mddev *mddev, void *priv)
 	kfree(conf);
 }
 
-/*
- * Is io distribute over 1 or more chunks ?
-*/
-static inline int is_io_in_chunk_boundary(struct mddev *mddev,
-			unsigned int chunk_sects, struct bio *bio)
-{
-	if (likely(is_power_of_2(chunk_sects))) {
-		return chunk_sects >=
-			((bio->bi_iter.bi_sector & (chunk_sects-1))
-					+ bio_sectors(bio));
-	} else{
-		sector_t sector = bio->bi_iter.bi_sector;
-		return chunk_sects >= (sector_div(sector, chunk_sects)
-						+ bio_sectors(bio));
-	}
-}
-
 static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
 {
 	struct r0conf *conf = mddev->private;
-- 
cgit v1.2.3


From a913096decbf4101271e7d87b9affb1454bb7676 Mon Sep 17 00:00:00 2001
From: Zhao Heming <heming.zhao@suse.com>
Date: Tue, 6 Oct 2020 00:00:23 +0800
Subject: md/bitmap: md_bitmap_read_sb uses wrong bitmap blocks

The patched code is used to get chunks number, should use round-up div
to replace current sector_div. The same code is in md_bitmap_resize():
```
chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift);
```

Signed-off-by: Zhao Heming <heming.zhao@suse.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/md-bitmap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 55b757a223a4..cd9a12ca281d 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -606,8 +606,8 @@ re_read:
 	if (bitmap->cluster_slot >= 0) {
 		sector_t bm_blocks = bitmap->mddev->resync_max_sectors;
 
-		sector_div(bm_blocks,
-			   bitmap->mddev->bitmap_info.chunksize >> 9);
+		bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks,
+			   (bitmap->mddev->bitmap_info.chunksize >> 9));
 		/* bits to bytes */
 		bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t);
 		/* to 4k blocks */
-- 
cgit v1.2.3


From d837f7277f56e70d82b3a4a037d744854e62f387 Mon Sep 17 00:00:00 2001
From: Zhao Heming <heming.zhao@suse.com>
Date: Tue, 6 Oct 2020 00:00:24 +0800
Subject: md/bitmap: md_bitmap_get_counter returns wrong blocks

md_bitmap_get_counter() has code:

```
    if (bitmap->bp[page].hijacked ||
        bitmap->bp[page].map == NULL)
        csize = ((sector_t)1) << (bitmap->chunkshift +
                      PAGE_COUNTER_SHIFT - 1);
```

The minus 1 is wrong, this branch should report 2048 bits of space.
With "-1" action, this only report 1024 bit of space.

This bug code returns wrong blocks, but it doesn't inflence bitmap logic:
1. Most callers focus this function return value (the counter of offset),
   not the parameter blocks.
2. The bug is only triggered when hijacked is true or map is NULL.
   the hijacked true condition is very rare.
   the "map == null" only true when array is creating or resizing.
3. Even the caller gets wrong blocks, current code makes caller just to
   call md_bitmap_get_counter() one more time.

Signed-off-by: Zhao Heming <heming.zhao@suse.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/md-bitmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index cd9a12ca281d..b3f13105659d 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -1368,7 +1368,7 @@ __acquires(bitmap->lock)
 	if (bitmap->bp[page].hijacked ||
 	    bitmap->bp[page].map == NULL)
 		csize = ((sector_t)1) << (bitmap->chunkshift +
-					  PAGE_COUNTER_SHIFT - 1);
+					  PAGE_COUNTER_SHIFT);
 	else
 		csize = ((sector_t)1) << bitmap->chunkshift;
 	*blocks = csize - (offset & (csize - 1));
-- 
cgit v1.2.3


From cf0b9b4821a2955f8a23813ef8f422208ced9bd7 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
Date: Thu, 8 Oct 2020 05:19:09 +0200
Subject: md: fix the checking of wrong work queue

It should check md_rdev_misc_wq instead of md_misc_wq.

Fixes: cc1ffe61c026 ("md: add new workqueue for delete rdev")
Cc: <stable@vger.kernel.org> # v5.8+
Signed-off-by: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/md.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index dfff4bb2bb20..f1c6304c79f4 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -9565,7 +9565,7 @@ static int __init md_init(void)
 		goto err_misc_wq;
 
 	md_rdev_misc_wq = alloc_workqueue("md_rdev_misc", 0, 0);
-	if (!md_misc_wq)
+	if (!md_rdev_misc_wq)
 		goto err_rdev_misc_wq;
 
 	if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
-- 
cgit v1.2.3


From 1383b347a8ae4a69c04ae3746e6cb5c8d38e2585 Mon Sep 17 00:00:00 2001
From: Zhao Heming <heming.zhao@suse.com>
Date: Sun, 27 Sep 2020 13:40:13 +0800
Subject: md/bitmap: fix memory leak of temporary bitmap

Callers of get_bitmap_from_slot() are responsible to free the bitmap.

Suggested-by: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
Signed-off-by: Zhao Heming <heming.zhao@suse.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/md-bitmap.c  | 3 ++-
 drivers/md/md-cluster.c | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index b3f13105659d..200c5d0f08bf 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -1950,6 +1950,7 @@ out:
 }
 EXPORT_SYMBOL_GPL(md_bitmap_load);
 
+/* caller need to free returned bitmap with md_bitmap_free() */
 struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot)
 {
 	int rv = 0;
@@ -2013,6 +2014,7 @@ int md_bitmap_copy_from_slot(struct mddev *mddev, int slot,
 	md_bitmap_unplug(mddev->bitmap);
 	*low = lo;
 	*high = hi;
+	md_bitmap_free(bitmap);
 
 	return rv;
 }
@@ -2616,4 +2618,3 @@ struct attribute_group md_bitmap_group = {
 	.name = "bitmap",
 	.attrs = md_bitmap_attrs,
 };
-
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 0580b51a156a..4aaf4820b6f6 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -1166,6 +1166,7 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz
 			 * can't resize bitmap
 			 */
 			goto out;
+		md_bitmap_free(bitmap);
 	}
 
 	return 0;
-- 
cgit v1.2.3


From b44c018cdf748b96b676ba09fdbc5b34fc443ada Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Mon, 5 Oct 2020 09:35:21 -0700
Subject: md/raid5: fix oops during stripe resizing

KoWei reported crash during raid5 reshape:

[ 1032.252932] Oops: 0002 [#1] SMP PTI
[...]
[ 1032.252943] RIP: 0010:memcpy_erms+0x6/0x10
[...]
[ 1032.252947] RSP: 0018:ffffba1ac0c03b78 EFLAGS: 00010286
[ 1032.252949] RAX: 0000784ac0000000 RBX: ffff91bec3d09740 RCX: 0000000000001000
[ 1032.252951] RDX: 0000000000001000 RSI: ffff91be6781c000 RDI: 0000784ac0000000
[ 1032.252953] RBP: ffffba1ac0c03bd8 R08: 0000000000001000 R09: ffffba1ac0c03bf8
[ 1032.252954] R10: 0000000000000000 R11: 0000000000000000 R12: ffffba1ac0c03bf8
[ 1032.252955] R13: 0000000000001000 R14: 0000000000000000 R15: 0000000000000000
[ 1032.252958] FS:  0000000000000000(0000) GS:ffff91becf500000(0000) knlGS:0000000000000000
[ 1032.252959] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1032.252961] CR2: 0000784ac0000000 CR3: 000000031780a002 CR4: 00000000001606e0
[ 1032.252962] Call Trace:
[ 1032.252969]  ? async_memcpy+0x179/0x1000 [async_memcpy]
[ 1032.252977]  ? raid5_release_stripe+0x8e/0x110 [raid456]
[ 1032.252982]  handle_stripe_expansion+0x15a/0x1f0 [raid456]
[ 1032.252988]  handle_stripe+0x592/0x1270 [raid456]
[ 1032.252993]  handle_active_stripes.isra.0+0x3cb/0x5a0 [raid456]
[ 1032.252999]  raid5d+0x35c/0x550 [raid456]
[ 1032.253002]  ? schedule+0x42/0xb0
[ 1032.253006]  ? schedule_timeout+0x10e/0x160
[ 1032.253011]  md_thread+0x97/0x160
[ 1032.253015]  ? wait_woken+0x80/0x80
[ 1032.253019]  kthread+0x104/0x140
[ 1032.253022]  ? md_start_sync+0x60/0x60
[ 1032.253024]  ? kthread_park+0x90/0x90
[ 1032.253027]  ret_from_fork+0x35/0x40

This is because cache_size_mutex was unlocked too early in resize_stripes,
which races with grow_one_stripe() that grow_one_stripe() allocates a
stripe with wrong pool_size.

Fix this issue by unlocking cache_size_mutex after updating pool_size.

Cc: <stable@vger.kernel.org> # v4.4+
Reported-by: KoWei Sung <winders@amazon.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 66690b40818e..39343479ac2a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2585,8 +2585,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 	} else
 		err = -ENOMEM;
 
-	mutex_unlock(&conf->cache_size_mutex);
-
 	conf->slab_cache = sc;
 	conf->active_name = 1-conf->active_name;
 
@@ -2628,6 +2626,8 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 
 	if (!err)
 		conf->pool_size = newsize;
+	mutex_unlock(&conf->cache_size_mutex);
+
 	return err;
 }
 
-- 
cgit v1.2.3