From 8dc23658b7aaa8b6b0609c81c8ad75e98b612801 Mon Sep 17 00:00:00 2001
From: Minfei Huang <mnghuan@gmail.com>
Date: Tue, 6 Sep 2016 16:00:29 +0800
Subject: dm: return correct error code in dm_resume()'s retry loop

dm_resume() will return success (0) rather than -EINVAL if
!dm_suspended_md() upon retry within dm_resume().

Reset the error code at the start of dm_resume()'s retry loop.
Also, remove a useless assignment at the end of dm_resume().

Fixes: ffcc393641 ("dm: enhance internal suspend and resume interface")
Cc: stable@vger.kernel.org # 3.19+
Signed-off-by: Minfei Huang <mnghuan@gmail.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index fa9b1cb4438a..c0742129a9cb 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2249,10 +2249,11 @@ static int __dm_resume(struct mapped_device *md, struct dm_table *map)
 
 int dm_resume(struct mapped_device *md)
 {
-	int r = -EINVAL;
+	int r;
 	struct dm_table *map = NULL;
 
 retry:
+	r = -EINVAL;
 	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
 
 	if (!dm_suspended_md(md))
@@ -2276,8 +2277,6 @@ retry:
 		goto out;
 
 	clear_bit(DMF_SUSPENDED, &md->flags);
-
-	r = 0;
 out:
 	mutex_unlock(&md->suspend_lock);
 
-- 
cgit v1.2.3


From 3b785fbcf81c3533772c52b717f77293099498d3 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Wed, 31 Aug 2016 15:17:49 -0700
Subject: dm: mark request_queue dead before destroying the DM device

This avoids that new requests are queued while __dm_destroy() is in
progress.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
---
 drivers/md/dm.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index c0742129a9cb..0f2928b3136b 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1873,6 +1873,7 @@ EXPORT_SYMBOL_GPL(dm_device_name);
 
 static void __dm_destroy(struct mapped_device *md, bool wait)
 {
+	struct request_queue *q = dm_get_md_queue(md);
 	struct dm_table *map;
 	int srcu_idx;
 
@@ -1883,6 +1884,10 @@ static void __dm_destroy(struct mapped_device *md, bool wait)
 	set_bit(DMF_FREEING, &md->flags);
 	spin_unlock(&_minor_lock);
 
+	spin_lock_irq(q->queue_lock);
+	queue_flag_set(QUEUE_FLAG_DYING, q);
+	spin_unlock_irq(q->queue_lock);
+
 	if (dm_request_based(md) && md->kworker_task)
 		flush_kthread_worker(&md->kworker);
 
-- 
cgit v1.2.3


From 2397a15aff35b5b4eed732ce81fda5a9d15053f9 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Wed, 31 Aug 2016 15:18:11 -0700
Subject: dm rq: factor out dm_mq_stop_queue()

Also, check that the blk-mq request_queue isn't already stopped.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-rq.c | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 1ca7463e8bb2..2f605f62e47d 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -98,18 +98,30 @@ static void dm_old_stop_queue(struct request_queue *q)
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
+static void dm_mq_stop_queue(struct request_queue *q)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	if (blk_queue_stopped(q)) {
+		spin_unlock_irqrestore(q->queue_lock, flags);
+		return;
+	}
+
+	queue_flag_set(QUEUE_FLAG_STOPPED, q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	/* Avoid that requeuing could restart the queue. */
+	blk_mq_cancel_requeue_work(q);
+	blk_mq_stop_hw_queues(q);
+}
+
 void dm_stop_queue(struct request_queue *q)
 {
 	if (!q->mq_ops)
 		dm_old_stop_queue(q);
-	else {
-		spin_lock_irq(q->queue_lock);
-		queue_flag_set(QUEUE_FLAG_STOPPED, q);
-		spin_unlock_irq(q->queue_lock);
-
-		blk_mq_cancel_requeue_work(q);
-		blk_mq_stop_hw_queues(q);
-	}
+	else
+		dm_mq_stop_queue(q);
 }
 
 static struct dm_rq_target_io *alloc_old_rq_tio(struct mapped_device *md,
-- 
cgit v1.2.3


From 9dbeaeabacb26260d1621fe58f0f6fdedc8860d4 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 1 Sep 2016 11:59:33 -0400
Subject: dm rq: take request_queue lock while clearing QUEUE_FLAG_STOPPED

Every call of queue_flag_clear_unlocked() after block device
initialization has finished is wrong if blk_cleanup_queue() can be
called concurrently.  Convert queue_flag_clear_unlocked() into
queue_flag_clear() and protect it by the block layer queue lock.

Also, factor out dm_mq_start_queue().

Reported-by: Bart Van Assche <bart.vanassche@sandisk.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
---
 drivers/md/dm-rq.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 2f605f62e47d..bd3ba97d44a2 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -73,15 +73,24 @@ static void dm_old_start_queue(struct request_queue *q)
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
+static void dm_mq_start_queue(struct request_queue *q)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	queue_flag_clear(QUEUE_FLAG_STOPPED, q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	blk_mq_start_stopped_hw_queues(q, true);
+	blk_mq_kick_requeue_list(q);
+}
+
 void dm_start_queue(struct request_queue *q)
 {
 	if (!q->mq_ops)
 		dm_old_start_queue(q);
-	else {
-		queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, q);
-		blk_mq_start_stopped_hw_queues(q, true);
-		blk_mq_kick_requeue_list(q);
-	}
+	else
+		dm_mq_start_queue(q);
 }
 
 static void dm_old_stop_queue(struct request_queue *q)
-- 
cgit v1.2.3


From f10e06b744074824fb8ec7066bc03ecc90918f5b Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 1 Sep 2016 12:06:37 -0400
Subject: dm mpath: check if path's request_queue is dying in activate_path()

If pg_init_retries is set and a request is queued against a multipath
device with all underlying block device request_queues in the "dying"
state then an infinite loop is triggered because activate_path() never
succeeds and hence never calls pg_init_done().

This change avoids that device removal triggers an infinite loop by
failing the activate_path() which causes the "dying" path to be failed.

Reported-by: Bart Van Assche <bart.vanassche@sandisk.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
---
 drivers/md/dm-mpath.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index ac734e5bbe48..15db5e9c572e 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1521,10 +1521,10 @@ static void activate_path(struct work_struct *work)
 {
 	struct pgpath *pgpath =
 		container_of(work, struct pgpath, activate_path.work);
+	struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
 
-	if (pgpath->is_active)
-		scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev),
-				 pg_init_done, pgpath);
+	if (pgpath->is_active && !blk_queue_dying(q))
+		scsi_dh_activate(q, pg_init_done, pgpath);
 	else
 		pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED);
 }
-- 
cgit v1.2.3


From c533f249a166142df4294ec38fa5dcd1903f0400 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Wed, 31 Aug 2016 15:17:24 -0700
Subject: dm rq: simplify dm_old_stop_queue()

This patch does not change any functionality.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-rq.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index bd3ba97d44a2..0d301d5a4d0b 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -98,12 +98,8 @@ static void dm_old_stop_queue(struct request_queue *q)
 	unsigned long flags;
 
 	spin_lock_irqsave(q->queue_lock, flags);
-	if (blk_queue_stopped(q)) {
-		spin_unlock_irqrestore(q->queue_lock, flags);
-		return;
-	}
-
-	blk_stop_queue(q);
+	if (!blk_queue_stopped(q))
+		blk_stop_queue(q);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
-- 
cgit v1.2.3


From 5a8f1f80e9dca791ee240213477df99e88258073 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Wed, 31 Aug 2016 15:17:04 -0700
Subject: dm: add two lockdep_assert_held() statements

Document the locking assumptions for the __bind() and __dm_suspend()
functions.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 0f2928b3136b..0708c620c17a 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1648,6 +1648,8 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
 	struct request_queue *q = md->queue;
 	sector_t size;
 
+	lockdep_assert_held(&md->suspend_lock);
+
 	size = dm_table_get_size(t);
 
 	/*
@@ -2094,6 +2096,8 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
 	bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
 	int r;
 
+	lockdep_assert_held(&md->suspend_lock);
+
 	/*
 	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
 	 * This flag is cleared before dm_suspend returns.
-- 
cgit v1.2.3


From b48633f83f22914073314d97d49da2a2e1d3b350 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Wed, 31 Aug 2016 15:16:02 -0700
Subject: dm: rename task state function arguments

Rename 'interruptible' into 'task_state' to make it clear that this
argument is a task state instead of a boolean.  Also, change type from
int to long.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 0708c620c17a..4aaffe05de94 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1941,7 +1941,7 @@ void dm_put(struct mapped_device *md)
 }
 EXPORT_SYMBOL_GPL(dm_put);
 
-static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
+static int dm_wait_for_completion(struct mapped_device *md, long task_state)
 {
 	int r = 0;
 	DECLARE_WAITQUEUE(wait, current);
@@ -1949,12 +1949,12 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
 	add_wait_queue(&md->wait, &wait);
 
 	while (1) {
-		set_current_state(interruptible);
+		set_current_state(task_state);
 
 		if (!md_in_flight(md))
 			break;
 
-		if (interruptible == TASK_INTERRUPTIBLE &&
+		if (task_state == TASK_INTERRUPTIBLE &&
 		    signal_pending(current)) {
 			r = -EINTR;
 			break;
@@ -2082,6 +2082,10 @@ static void unlock_fs(struct mapped_device *md)
 }
 
 /*
+ * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG
+ * @task_state: e.g. TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE
+ * @dmf_suspended_flag: DMF_SUSPENDED or DMF_SUSPENDED_INTERNALLY
+ *
  * If __dm_suspend returns 0, the device is completely quiescent
  * now. There is no request-processing activity. All new requests
  * are being added to md->deferred list.
@@ -2089,7 +2093,7 @@ static void unlock_fs(struct mapped_device *md)
  * Caller must hold md->suspend_lock
  */
 static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
-			unsigned suspend_flags, int interruptible,
+			unsigned suspend_flags, long task_state,
 			int dmf_suspended_flag)
 {
 	bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
@@ -2158,7 +2162,7 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
 	 * We call dm_wait_for_completion to wait for all existing requests
 	 * to finish.
 	 */
-	r = dm_wait_for_completion(md, interruptible);
+	r = dm_wait_for_completion(md, task_state);
 	if (!r)
 		set_bit(dmf_suspended_flag, &md->flags);
 
-- 
cgit v1.2.3


From e3fabdfdf70e2b340cff968fd1d13e4c624de926 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Wed, 31 Aug 2016 15:16:22 -0700
Subject: dm: use signal_pending_state() in dm_wait_for_completion()

Use signal_pending_state() instead of open-coding it.  This patch does
not change any functionality but makes it possible to pass TASK_KILLABLE
as the second argument of dm_wait_for_completion().  See also commit
16882c1e962b ("sched: fix TASK_WAKEKILL vs SIGKILL race").

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>.
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 4aaffe05de94..6678cb2c2138 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1954,8 +1954,7 @@ static int dm_wait_for_completion(struct mapped_device *md, long task_state)
 		if (!md_in_flight(md))
 			break;
 
-		if (task_state == TASK_INTERRUPTIBLE &&
-		    signal_pending(current)) {
+		if (signal_pending_state(task_state, current)) {
 			r = -EINTR;
 			break;
 		}
-- 
cgit v1.2.3


From 9f4c3f874a3ab8fb845dd2f04f4396ebc5c1f225 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Wed, 31 Aug 2016 15:16:43 -0700
Subject: dm: convert wait loops to use autoremove_wake_function()

Use autoremove_wake_function() instead of default_wake_function()
to make the dm wait loops more similar to other wait loops in the
kernel.  This patch does not change any functionality.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-mpath.c | 10 +++-------
 drivers/md/dm.c       | 10 +++-------
 2 files changed, 6 insertions(+), 14 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 15db5e9c572e..c777d38f4b11 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1193,21 +1193,17 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
 static void multipath_wait_for_pg_init_completion(struct multipath *m)
 {
-	DECLARE_WAITQUEUE(wait, current);
-
-	add_wait_queue(&m->pg_init_wait, &wait);
+	DEFINE_WAIT(wait);
 
 	while (1) {
-		set_current_state(TASK_UNINTERRUPTIBLE);
+		prepare_to_wait(&m->pg_init_wait, &wait, TASK_UNINTERRUPTIBLE);
 
 		if (!atomic_read(&m->pg_init_in_progress))
 			break;
 
 		io_schedule();
 	}
-	set_current_state(TASK_RUNNING);
-
-	remove_wait_queue(&m->pg_init_wait, &wait);
+	finish_wait(&m->pg_init_wait, &wait);
 }
 
 static void flush_multipath_work(struct multipath *m)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 6678cb2c2138..be35258324c1 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1944,12 +1944,10 @@ EXPORT_SYMBOL_GPL(dm_put);
 static int dm_wait_for_completion(struct mapped_device *md, long task_state)
 {
 	int r = 0;
-	DECLARE_WAITQUEUE(wait, current);
-
-	add_wait_queue(&md->wait, &wait);
+	DEFINE_WAIT(wait);
 
 	while (1) {
-		set_current_state(task_state);
+		prepare_to_wait(&md->wait, &wait, task_state);
 
 		if (!md_in_flight(md))
 			break;
@@ -1961,9 +1959,7 @@ static int dm_wait_for_completion(struct mapped_device *md, long task_state)
 
 		io_schedule();
 	}
-	set_current_state(TASK_RUNNING);
-
-	remove_wait_queue(&md->wait, &wait);
+	finish_wait(&md->wait, &wait);
 
 	return r;
 }
-- 
cgit v1.2.3


From a8ac51e4ab97765838ae6a07d6ff7f7bfaaa0ea3 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Fri, 9 Sep 2016 19:24:57 -0400
Subject: dm rq: add DM_MAPIO_DELAY_REQUEUE to delay requeue of blk-mq requests

Otherwise blk-mq will immediately dispatch requests that are requeued
via a BLK_MQ_RQ_QUEUE_BUSY return from blk_mq_ops .queue_rq.

Delayed requeue is implemented using blk_mq_delay_kick_requeue_list()
with a delay of 5 secs.  In the context of DM multipath (all paths down)
it doesn't make any sense to requeue more quickly.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-rq.c | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 0d301d5a4d0b..dbced7b15931 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -336,20 +336,21 @@ static void dm_old_requeue_request(struct request *rq)
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
-static void dm_mq_requeue_request(struct request *rq)
+static void dm_mq_delay_requeue_request(struct request *rq, unsigned long msecs)
 {
 	struct request_queue *q = rq->q;
 	unsigned long flags;
 
 	blk_mq_requeue_request(rq);
+
 	spin_lock_irqsave(q->queue_lock, flags);
 	if (!blk_queue_stopped(q))
-		blk_mq_kick_requeue_list(q);
+		blk_mq_delay_kick_requeue_list(q, msecs);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
 static void dm_requeue_original_request(struct mapped_device *md,
-					struct request *rq)
+					struct request *rq, bool delay_requeue)
 {
 	int rw = rq_data_dir(rq);
 
@@ -359,7 +360,7 @@ static void dm_requeue_original_request(struct mapped_device *md,
 	if (!rq->q->mq_ops)
 		dm_old_requeue_request(rq);
 	else
-		dm_mq_requeue_request(rq);
+		dm_mq_delay_requeue_request(rq, delay_requeue ? 5000 : 0);
 
 	rq_completed(md, rw, false);
 }
@@ -389,7 +390,7 @@ static void dm_done(struct request *clone, int error, bool mapped)
 		return;
 	else if (r == DM_ENDIO_REQUEUE)
 		/* The target wants to requeue the I/O */
-		dm_requeue_original_request(tio->md, tio->orig);
+		dm_requeue_original_request(tio->md, tio->orig, false);
 	else {
 		DMWARN("unimplemented target endio return value: %d", r);
 		BUG();
@@ -629,8 +630,8 @@ static int dm_old_prep_fn(struct request_queue *q, struct request *rq)
 
 /*
  * Returns:
- * 0                : the request has been processed
- * DM_MAPIO_REQUEUE : the original request needs to be requeued
+ * DM_MAPIO_*       : the request has been processed as indicated
+ * DM_MAPIO_REQUEUE : the original request needs to be immediately requeued
  * < 0              : the request was completed due to failure
  */
 static int map_request(struct dm_rq_target_io *tio, struct request *rq,
@@ -643,6 +644,8 @@ static int map_request(struct dm_rq_target_io *tio, struct request *rq,
 	if (tio->clone) {
 		clone = tio->clone;
 		r = ti->type->map_rq(ti, clone, &tio->info);
+		if (r == DM_MAPIO_DELAY_REQUEUE)
+			return DM_MAPIO_REQUEUE; /* .request_fn requeue is always immediate */
 	} else {
 		r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone);
 		if (r < 0) {
@@ -650,9 +653,8 @@ static int map_request(struct dm_rq_target_io *tio, struct request *rq,
 			dm_kill_unmapped_request(rq, r);
 			return r;
 		}
-		if (r != DM_MAPIO_REMAPPED)
-			return r;
-		if (setup_clone(clone, rq, tio, GFP_ATOMIC)) {
+		if (r == DM_MAPIO_REMAPPED &&
+		    setup_clone(clone, rq, tio, GFP_ATOMIC)) {
 			/* -ENOMEM */
 			ti->type->release_clone_rq(clone);
 			return DM_MAPIO_REQUEUE;
@@ -671,7 +673,10 @@ static int map_request(struct dm_rq_target_io *tio, struct request *rq,
 		break;
 	case DM_MAPIO_REQUEUE:
 		/* The target wants to requeue the I/O */
-		dm_requeue_original_request(md, tio->orig);
+		break;
+	case DM_MAPIO_DELAY_REQUEUE:
+		/* The target wants to requeue the I/O after a delay */
+		dm_requeue_original_request(md, tio->orig, true);
 		break;
 	default:
 		if (r > 0) {
@@ -681,10 +686,9 @@ static int map_request(struct dm_rq_target_io *tio, struct request *rq,
 
 		/* The target wants to complete the I/O */
 		dm_kill_unmapped_request(rq, r);
-		return r;
 	}
 
-	return 0;
+	return r;
 }
 
 static void dm_start_request(struct mapped_device *md, struct request *orig)
@@ -727,7 +731,7 @@ static void map_tio_request(struct kthread_work *work)
 	struct mapped_device *md = tio->md;
 
 	if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE)
-		dm_requeue_original_request(md, rq);
+		dm_requeue_original_request(md, rq, false);
 }
 
 ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf)
-- 
cgit v1.2.3


From fbc39b4ca3bed38c6d62c658af2157d2ec9efa03 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 13 Sep 2016 12:16:14 -0400
Subject: dm rq: reduce arguments passed to map_request() and
 dm_requeue_original_request()

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
---
 drivers/md/dm-rq.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index dbced7b15931..8eefc0ad7a59 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -349,9 +349,10 @@ static void dm_mq_delay_requeue_request(struct request *rq, unsigned long msecs)
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
-static void dm_requeue_original_request(struct mapped_device *md,
-					struct request *rq, bool delay_requeue)
+static void dm_requeue_original_request(struct dm_rq_target_io *tio, bool delay_requeue)
 {
+	struct mapped_device *md = tio->md;
+	struct request *rq = tio->orig;
 	int rw = rq_data_dir(rq);
 
 	rq_end_stats(md, rq);
@@ -390,7 +391,7 @@ static void dm_done(struct request *clone, int error, bool mapped)
 		return;
 	else if (r == DM_ENDIO_REQUEUE)
 		/* The target wants to requeue the I/O */
-		dm_requeue_original_request(tio->md, tio->orig, false);
+		dm_requeue_original_request(tio, false);
 	else {
 		DMWARN("unimplemented target endio return value: %d", r);
 		BUG();
@@ -634,11 +635,12 @@ static int dm_old_prep_fn(struct request_queue *q, struct request *rq)
  * DM_MAPIO_REQUEUE : the original request needs to be immediately requeued
  * < 0              : the request was completed due to failure
  */
-static int map_request(struct dm_rq_target_io *tio, struct request *rq,
-		       struct mapped_device *md)
+static int map_request(struct dm_rq_target_io *tio)
 {
 	int r;
 	struct dm_target *ti = tio->ti;
+	struct mapped_device *md = tio->md;
+	struct request *rq = tio->orig;
 	struct request *clone = NULL;
 
 	if (tio->clone) {
@@ -676,7 +678,7 @@ static int map_request(struct dm_rq_target_io *tio, struct request *rq,
 		break;
 	case DM_MAPIO_DELAY_REQUEUE:
 		/* The target wants to requeue the I/O after a delay */
-		dm_requeue_original_request(md, tio->orig, true);
+		dm_requeue_original_request(tio, true);
 		break;
 	default:
 		if (r > 0) {
@@ -727,11 +729,9 @@ static void dm_start_request(struct mapped_device *md, struct request *orig)
 static void map_tio_request(struct kthread_work *work)
 {
 	struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work);
-	struct request *rq = tio->orig;
-	struct mapped_device *md = tio->md;
 
-	if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE)
-		dm_requeue_original_request(md, rq, false);
+	if (map_request(tio) == DM_MAPIO_REQUEUE)
+		dm_requeue_original_request(tio, false);
 }
 
 ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf)
@@ -917,7 +917,7 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
 	tio->ti = ti;
 
 	/* Direct call is fine since .queue_rq allows allocations */
-	if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) {
+	if (map_request(tio) == DM_MAPIO_REQUEUE) {
 		/* Undo dm_start_request() before requeuing */
 		rq_end_stats(md, rq);
 		rq_completed(md, rq_data_dir(rq), false);
-- 
cgit v1.2.3


From e0c107526960d1348cfe21f12bcfb3348fd7e8ab Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 14 Sep 2016 10:36:39 -0400
Subject: dm rq: introduce dm_mq_kick_requeue_list()

Make it possible for a request-based target to kick the DM device's
blk-mq request_queue's requeue_list.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
---
 drivers/md/dm-rq.c | 17 +++++++++++++----
 drivers/md/dm-rq.h |  2 ++
 2 files changed, 15 insertions(+), 4 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 8eefc0ad7a59..877b8f33620e 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -336,19 +336,28 @@ static void dm_old_requeue_request(struct request *rq)
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
-static void dm_mq_delay_requeue_request(struct request *rq, unsigned long msecs)
+static void __dm_mq_kick_requeue_list(struct request_queue *q, unsigned long msecs)
 {
-	struct request_queue *q = rq->q;
 	unsigned long flags;
 
-	blk_mq_requeue_request(rq);
-
 	spin_lock_irqsave(q->queue_lock, flags);
 	if (!blk_queue_stopped(q))
 		blk_mq_delay_kick_requeue_list(q, msecs);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
+void dm_mq_kick_requeue_list(struct mapped_device *md)
+{
+	__dm_mq_kick_requeue_list(dm_get_md_queue(md), 0);
+}
+EXPORT_SYMBOL(dm_mq_kick_requeue_list);
+
+static void dm_mq_delay_requeue_request(struct request *rq, unsigned long msecs)
+{
+	blk_mq_requeue_request(rq);
+	__dm_mq_kick_requeue_list(rq->q, msecs);
+}
+
 static void dm_requeue_original_request(struct dm_rq_target_io *tio, bool delay_requeue)
 {
 	struct mapped_device *md = tio->md;
diff --git a/drivers/md/dm-rq.h b/drivers/md/dm-rq.h
index 9e6f0a3773d4..4da06cae7bad 100644
--- a/drivers/md/dm-rq.h
+++ b/drivers/md/dm-rq.h
@@ -55,6 +55,8 @@ void dm_mq_cleanup_mapped_device(struct mapped_device *md);
 void dm_start_queue(struct request_queue *q);
 void dm_stop_queue(struct request_queue *q);
 
+void dm_mq_kick_requeue_list(struct mapped_device *md);
+
 unsigned dm_get_reserved_rq_based_ios(void);
 
 ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf);
-- 
cgit v1.2.3


From 7e48c768f44056a06bca596577c37f7721b53f0c Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 14 Sep 2016 10:47:03 -0400
Subject: dm mpath: use dm_mq_kick_requeue_list()

When reinstating a path the blk-mq request_queue's requeue_list should
get kicked.  It makes sense to kick the requeue_list as part of the
existing hook (previously only used by bio-based support).

Rename process_queued_bios_list to process_queued_io_list.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
---
 drivers/md/dm-mpath.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index c777d38f4b11..f69715bf0575 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -680,9 +680,11 @@ static int multipath_map_bio(struct dm_target *ti, struct bio *bio)
 	return __multipath_map_bio(m, bio, mpio);
 }
 
-static void process_queued_bios_list(struct multipath *m)
+static void process_queued_io_list(struct multipath *m)
 {
-	if (m->queue_mode == DM_TYPE_BIO_BASED)
+	if (m->queue_mode == DM_TYPE_MQ_REQUEST_BASED)
+		dm_mq_kick_requeue_list(dm_table_get_md(m->ti->table));
+	else if (m->queue_mode == DM_TYPE_BIO_BASED)
 		queue_work(kmultipathd, &m->process_queued_bios);
 }
 
@@ -752,7 +754,7 @@ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path,
 
 	if (!queue_if_no_path) {
 		dm_table_run_md_queue_async(m->ti->table);
-		process_queued_bios_list(m);
+		process_queued_io_list(m);
 	}
 
 	return 0;
@@ -1304,7 +1306,7 @@ out:
 	spin_unlock_irqrestore(&m->lock, flags);
 	if (run_queue) {
 		dm_table_run_md_queue_async(m->ti->table);
-		process_queued_bios_list(m);
+		process_queued_io_list(m);
 	}
 
 	return r;
@@ -1502,7 +1504,7 @@ static void pg_init_done(void *data, int errors)
 	}
 	clear_bit(MPATHF_QUEUE_IO, &m->flags);
 
-	process_queued_bios_list(m);
+	process_queued_io_list(m);
 
 	/*
 	 * Wake up any thread waiting to suspend.
@@ -1937,7 +1939,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti,
 		if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
 			pg_init_all_paths(m);
 		dm_table_run_md_queue_async(m->ti->table);
-		process_queued_bios_list(m);
+		process_queued_io_list(m);
 	}
 
 	/*
-- 
cgit v1.2.3


From b88efd43f900d608560211a18a38d450f8192948 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Fri, 9 Sep 2016 19:26:19 -0400
Subject: dm mpath: delay the requeue of blk-mq requests while all paths down

Return DM_MAPIO_DELAY_REQUEUE from .clone_and_map_rq.  Also, return
false from .busy, if all paths are down, so that blk-mq requests get
mapped via .clone_and_map_rq -- which results in DM_MAPIO_DELAY_REQUEUE
being returned to dm-rq.

This change allows for a noticeable reduction in cpu utilization
(reduced kworker load) while all paths are down, e.g.:

system CPU idleness (as measured by fio's --idle-prof=system):
before: system: 86.58%
after:  system: 98.60%

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
---
 drivers/md/dm-mpath.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index f69715bf0575..f31fa1364abc 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -550,9 +550,9 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
 		pgpath = choose_pgpath(m, nr_bytes);
 
 	if (!pgpath) {
-		if (!must_push_back_rq(m))
-			r = -EIO;	/* Failed */
-		return r;
+		if (must_push_back_rq(m))
+			return DM_MAPIO_DELAY_REQUEUE;
+		return -EIO;	/* Failed */
 	} else if (test_bit(MPATHF_QUEUE_IO, &m->flags) ||
 		   test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) {
 		pg_init_all_paths(m);
@@ -1992,11 +1992,14 @@ static int multipath_busy(struct dm_target *ti)
 	struct priority_group *pg, *next_pg;
 	struct pgpath *pgpath;
 
-	/* pg_init in progress or no paths available */
-	if (atomic_read(&m->pg_init_in_progress) ||
-	    (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)))
+	/* pg_init in progress */
+	if (atomic_read(&m->pg_init_in_progress))
 		return true;
 
+	/* no paths available, for blk-mq: rely on IO mapping to delay requeue */
+	if (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
+		return (m->queue_mode != DM_TYPE_MQ_REQUEST_BASED);
+
 	/* Guess which priority_group will be used at next mapping time */
 	pg = lockless_dereference(m->current_pg);
 	next_pg = lockless_dereference(m->next_pg);
-- 
cgit v1.2.3


From dd6a77d99859ab963503e67372ed278fe8ceab26 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Thu, 15 Sep 2016 08:45:44 -0400
Subject: dm array: add dm_array_new()

dm_array_new() creates a new, populated array more efficiently than
starting with an empty one and resizing.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/persistent-data/dm-array.c | 142 ++++++++++++++++++++++++++--------
 drivers/md/persistent-data/dm-array.h |  19 +++++
 2 files changed, 130 insertions(+), 31 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c
index 431a03067d64..155180954cdf 100644
--- a/drivers/md/persistent-data/dm-array.c
+++ b/drivers/md/persistent-data/dm-array.c
@@ -277,6 +277,48 @@ static int insert_ablock(struct dm_array_info *info, uint64_t index,
 	return dm_btree_insert(&info->btree_info, *root, &index, &block_le, root);
 }
 
+/*----------------------------------------------------------------*/
+
+static int __shadow_ablock(struct dm_array_info *info, dm_block_t b,
+			   struct dm_block **block, struct array_block **ab)
+{
+	int inc;
+	int r = dm_tm_shadow_block(info->btree_info.tm, b,
+				   &array_validator, block, &inc);
+	if (r)
+		return r;
+
+	*ab = dm_block_data(*block);
+	if (inc)
+		inc_ablock_entries(info, *ab);
+
+	return 0;
+}
+
+/*
+ * The shadow op will often be a noop.  Only insert if it really
+ * copied data.
+ */
+static int __reinsert_ablock(struct dm_array_info *info, unsigned index,
+			     struct dm_block *block, dm_block_t b,
+			     dm_block_t *root)
+{
+	int r = 0;
+
+	if (dm_block_location(block) != b) {
+		/*
+		 * dm_tm_shadow_block will have already decremented the old
+		 * block, but it is still referenced by the btree.  We
+		 * increment to stop the insert decrementing it below zero
+		 * when overwriting the old value.
+		 */
+		dm_tm_inc(info->btree_info.tm, b);
+		r = insert_ablock(info, index, block, root);
+	}
+
+	return r;
+}
+
 /*
  * Looks up an array block in the btree.  Then shadows it, and updates the
  * btree to point to this new shadow.  'root' is an input/output parameter
@@ -286,49 +328,21 @@ static int shadow_ablock(struct dm_array_info *info, dm_block_t *root,
 			 unsigned index, struct dm_block **block,
 			 struct array_block **ab)
 {
-	int r, inc;
+	int r;
 	uint64_t key = index;
 	dm_block_t b;
 	__le64 block_le;
 
-	/*
-	 * lookup
-	 */
 	r = dm_btree_lookup(&info->btree_info, *root, &key, &block_le);
 	if (r)
 		return r;
 	b = le64_to_cpu(block_le);
 
-	/*
-	 * shadow
-	 */
-	r = dm_tm_shadow_block(info->btree_info.tm, b,
-			       &array_validator, block, &inc);
+	r = __shadow_ablock(info, b, block, ab);
 	if (r)
 		return r;
 
-	*ab = dm_block_data(*block);
-	if (inc)
-		inc_ablock_entries(info, *ab);
-
-	/*
-	 * Reinsert.
-	 *
-	 * The shadow op will often be a noop.  Only insert if it really
-	 * copied data.
-	 */
-	if (dm_block_location(*block) != b) {
-		/*
-		 * dm_tm_shadow_block will have already decremented the old
-		 * block, but it is still referenced by the btree.  We
-		 * increment to stop the insert decrementing it below zero
-		 * when overwriting the old value.
-		 */
-		dm_tm_inc(info->btree_info.tm, b);
-		r = insert_ablock(info, index, *block, root);
-	}
-
-	return r;
+	return __reinsert_ablock(info, index, *block, b, root);
 }
 
 /*
@@ -681,6 +695,72 @@ int dm_array_resize(struct dm_array_info *info, dm_block_t root,
 }
 EXPORT_SYMBOL_GPL(dm_array_resize);
 
+static int populate_ablock_with_values(struct dm_array_info *info, struct array_block *ab,
+				       value_fn fn, void *context, unsigned base, unsigned new_nr)
+{
+	int r;
+	unsigned i;
+	uint32_t nr_entries;
+	struct dm_btree_value_type *vt = &info->value_type;
+
+	BUG_ON(le32_to_cpu(ab->nr_entries));
+	BUG_ON(new_nr > le32_to_cpu(ab->max_entries));
+
+	nr_entries = le32_to_cpu(ab->nr_entries);
+	for (i = 0; i < new_nr; i++) {
+		r = fn(base + i, element_at(info, ab, i), context);
+		if (r)
+			return r;
+
+		if (vt->inc)
+			vt->inc(vt->context, element_at(info, ab, i));
+	}
+
+	ab->nr_entries = cpu_to_le32(new_nr);
+	return 0;
+}
+
+int dm_array_new(struct dm_array_info *info, dm_block_t *root,
+		 uint32_t size, value_fn fn, void *context)
+{
+	int r;
+	struct dm_block *block;
+	struct array_block *ab;
+	unsigned block_index, end_block, size_of_block, max_entries;
+
+	r = dm_array_empty(info, root);
+	if (r)
+		return r;
+
+	size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm));
+	max_entries = calc_max_entries(info->value_type.size, size_of_block);
+	end_block = dm_div_up(size, max_entries);
+
+	for (block_index = 0; block_index != end_block; block_index++) {
+		r = alloc_ablock(info, size_of_block, max_entries, &block, &ab);
+		if (r)
+			break;
+
+		r = populate_ablock_with_values(info, ab, fn, context,
+						block_index * max_entries,
+						min(max_entries, size));
+		if (r) {
+			unlock_ablock(info, block);
+			break;
+		}
+
+		r = insert_ablock(info, block_index, block, root);
+		unlock_ablock(info, block);
+		if (r)
+			break;
+
+		size -= max_entries;
+	}
+
+	return r;
+}
+EXPORT_SYMBOL_GPL(dm_array_new);
+
 int dm_array_del(struct dm_array_info *info, dm_block_t root)
 {
 	return dm_btree_del(&info->btree_info, root);
diff --git a/drivers/md/persistent-data/dm-array.h b/drivers/md/persistent-data/dm-array.h
index ea177d6fa58f..45ea3f6e1ded 100644
--- a/drivers/md/persistent-data/dm-array.h
+++ b/drivers/md/persistent-data/dm-array.h
@@ -111,6 +111,25 @@ int dm_array_resize(struct dm_array_info *info, dm_block_t root,
 		    const void *value, dm_block_t *new_root)
 	__dm_written_to_disk(value);
 
+/*
+ * Creates a new array populated with values provided by a callback
+ * function.  This is more efficient than creating an empty array,
+ * resizing, and then setting values since that process incurs a lot of
+ * copying.
+ *
+ * Assumes 32bit values for now since it's only used by the cache hint
+ * array.
+ *
+ * info - describes the array
+ * root - the root block of the array on disk
+ * size - the number of entries in the array
+ * fn - the callback
+ * context - passed to the callback
+ */
+typedef int (*value_fn)(uint32_t index, void *value_le, void *context);
+int dm_array_new(struct dm_array_info *info, dm_block_t *root,
+		 uint32_t size, value_fn fn, void *context);
+
 /*
  * Frees a whole array.  The value_type's decrement operation will be called
  * for all values in the array
-- 
cgit v1.2.3


From 4e781b498ee5008ede91362d91404a362e7a46b3 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Thu, 15 Sep 2016 09:23:46 -0400
Subject: dm cache: speed up writing of the hint array

It's far quicker to always delete the hint array and recreate with
dm_array_new() because we avoid the copying caused by mutation.

Also simplifies the policy interface, replacing the walk_hints() with
the simpler get_hint().

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-metadata.c        | 80 ++++++++++++-----------------------
 drivers/md/dm-cache-policy-cleaner.c  |  2 +-
 drivers/md/dm-cache-policy-internal.h |  6 +--
 drivers/md/dm-cache-policy-smq.c      | 38 +++--------------
 drivers/md/dm-cache-policy.h          | 10 ++---
 5 files changed, 42 insertions(+), 94 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index 3970cda10080..a60f10a0ee0a 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -1368,10 +1368,24 @@ int dm_cache_get_metadata_dev_size(struct dm_cache_metadata *cmd,
 
 /*----------------------------------------------------------------*/
 
-static int begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy)
+static int get_hint(uint32_t index, void *value_le, void *context)
+{
+	uint32_t value;
+	struct dm_cache_policy *policy = context;
+
+	value = policy_get_hint(policy, to_cblock(index));
+	*((__le32 *) value_le) = cpu_to_le32(value);
+
+	return 0;
+}
+
+/*
+ * It's quicker to always delete the hint array, and recreate with
+ * dm_array_new().
+ */
+static int write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy)
 {
 	int r;
-	__le32 value;
 	size_t hint_size;
 	const char *policy_name = dm_cache_policy_get_name(policy);
 	const unsigned *policy_version = dm_cache_policy_get_version(policy);
@@ -1380,63 +1394,23 @@ static int begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *po
 	    (strlen(policy_name) > sizeof(cmd->policy_name) - 1))
 		return -EINVAL;
 
-	if (!policy_unchanged(cmd, policy)) {
-		strncpy(cmd->policy_name, policy_name, sizeof(cmd->policy_name));
-		memcpy(cmd->policy_version, policy_version, sizeof(cmd->policy_version));
+	strncpy(cmd->policy_name, policy_name, sizeof(cmd->policy_name));
+	memcpy(cmd->policy_version, policy_version, sizeof(cmd->policy_version));
 
-		hint_size = dm_cache_policy_get_hint_size(policy);
-		if (!hint_size)
-			return 0; /* short-circuit hints initialization */
-		cmd->policy_hint_size = hint_size;
+	hint_size = dm_cache_policy_get_hint_size(policy);
+	if (!hint_size)
+		return 0; /* short-circuit hints initialization */
+	cmd->policy_hint_size = hint_size;
 
-		if (cmd->hint_root) {
-			r = dm_array_del(&cmd->hint_info, cmd->hint_root);
-			if (r)
-				return r;
-		}
-
-		r = dm_array_empty(&cmd->hint_info, &cmd->hint_root);
+	if (cmd->hint_root) {
+		r = dm_array_del(&cmd->hint_info, cmd->hint_root);
 		if (r)
 			return r;
-
-		value = cpu_to_le32(0);
-		__dm_bless_for_disk(&value);
-		r = dm_array_resize(&cmd->hint_info, cmd->hint_root, 0,
-				    from_cblock(cmd->cache_blocks),
-				    &value, &cmd->hint_root);
-		if (r)
-			return r;
-	}
-
-	return 0;
-}
-
-static int save_hint(void *context, dm_cblock_t cblock, dm_oblock_t oblock, uint32_t hint)
-{
-	struct dm_cache_metadata *cmd = context;
-	__le32 value = cpu_to_le32(hint);
-	int r;
-
-	__dm_bless_for_disk(&value);
-
-	r = dm_array_set_value(&cmd->hint_info, cmd->hint_root,
-			       from_cblock(cblock), &value, &cmd->hint_root);
-	cmd->changed = true;
-
-	return r;
-}
-
-static int write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy)
-{
-	int r;
-
-	r = begin_hints(cmd, policy);
-	if (r) {
-		DMERR("begin_hints failed");
-		return r;
 	}
 
-	return policy_walk_mappings(policy, save_hint, cmd);
+	return dm_array_new(&cmd->hint_info, &cmd->hint_root,
+			    from_cblock(cmd->cache_blocks),
+			    get_hint, policy);
 }
 
 int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy)
diff --git a/drivers/md/dm-cache-policy-cleaner.c b/drivers/md/dm-cache-policy-cleaner.c
index 14aaaf059f06..2e8a8f1d8358 100644
--- a/drivers/md/dm-cache-policy-cleaner.c
+++ b/drivers/md/dm-cache-policy-cleaner.c
@@ -395,7 +395,7 @@ static void init_policy_functions(struct policy *p)
 	p->policy.set_dirty = wb_set_dirty;
 	p->policy.clear_dirty = wb_clear_dirty;
 	p->policy.load_mapping = wb_load_mapping;
-	p->policy.walk_mappings = NULL;
+	p->policy.get_hint = NULL;
 	p->policy.remove_mapping = wb_remove_mapping;
 	p->policy.writeback_work = wb_writeback_work;
 	p->policy.force_mapping = wb_force_mapping;
diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h
index 2816018faa7f..808ee0e2b2c4 100644
--- a/drivers/md/dm-cache-policy-internal.h
+++ b/drivers/md/dm-cache-policy-internal.h
@@ -48,10 +48,10 @@ static inline int policy_load_mapping(struct dm_cache_policy *p,
 	return p->load_mapping(p, oblock, cblock, hint, hint_valid);
 }
 
-static inline int policy_walk_mappings(struct dm_cache_policy *p,
-				      policy_walk_fn fn, void *context)
+static inline uint32_t policy_get_hint(struct dm_cache_policy *p,
+				       dm_cblock_t cblock)
 {
-	return p->walk_mappings ? p->walk_mappings(p, fn, context) : 0;
+	return p->get_hint ? p->get_hint(p, cblock) : 0;
 }
 
 static inline int policy_writeback_work(struct dm_cache_policy *p,
diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
index cf48a617a3a4..f3cec4e6333c 100644
--- a/drivers/md/dm-cache-policy-smq.c
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -1375,41 +1375,15 @@ static int smq_load_mapping(struct dm_cache_policy *p,
 	return 0;
 }
 
-static int smq_save_hints(struct smq_policy *mq, struct queue *q,
-			  policy_walk_fn fn, void *context)
-{
-	int r;
-	unsigned level;
-	struct entry *e;
-
-	for (level = 0; level < q->nr_levels; level++)
-		for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) {
-			if (!e->sentinel) {
-				r = fn(context, infer_cblock(mq, e),
-				       e->oblock, e->level);
-				if (r)
-					return r;
-			}
-		}
-
-	return 0;
-}
-
-static int smq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn,
-			     void *context)
+static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock)
 {
 	struct smq_policy *mq = to_smq_policy(p);
-	int r = 0;
+	struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
 
-	/*
-	 * We don't need to lock here since this method is only called once
-	 * the IO has stopped.
-	 */
-	r = smq_save_hints(mq, &mq->clean, fn, context);
-	if (!r)
-		r = smq_save_hints(mq, &mq->dirty, fn, context);
+	if (!e->allocated)
+		return 0;
 
-	return r;
+	return e->level;
 }
 
 static void __remove_mapping(struct smq_policy *mq, dm_oblock_t oblock)
@@ -1616,7 +1590,7 @@ static void init_policy_functions(struct smq_policy *mq, bool mimic_mq)
 	mq->policy.set_dirty = smq_set_dirty;
 	mq->policy.clear_dirty = smq_clear_dirty;
 	mq->policy.load_mapping = smq_load_mapping;
-	mq->policy.walk_mappings = smq_walk_mappings;
+	mq->policy.get_hint = smq_get_hint;
 	mq->policy.remove_mapping = smq_remove_mapping;
 	mq->policy.remove_cblock = smq_remove_cblock;
 	mq->policy.writeback_work = smq_writeback_work;
diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h
index 05db56eedb6a..aa10b1493f34 100644
--- a/drivers/md/dm-cache-policy.h
+++ b/drivers/md/dm-cache-policy.h
@@ -90,9 +90,6 @@ struct policy_result {
 	dm_cblock_t cblock;	/* POLICY_HIT, POLICY_NEW, POLICY_REPLACE */
 };
 
-typedef int (*policy_walk_fn)(void *context, dm_cblock_t cblock,
-			      dm_oblock_t oblock, uint32_t hint);
-
 /*
  * The cache policy object.  Just a bunch of methods.  It is envisaged that
  * this structure will be embedded in a bigger, policy specific structure
@@ -158,8 +155,11 @@ struct dm_cache_policy {
 	int (*load_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock,
 			    dm_cblock_t cblock, uint32_t hint, bool hint_valid);
 
-	int (*walk_mappings)(struct dm_cache_policy *p, policy_walk_fn fn,
-			     void *context);
+	/*
+	 * Gets the hint for a given cblock.  Called in a single threaded
+	 * context.  So no locking required.
+	 */
+	uint32_t (*get_hint)(struct dm_cache_policy *p, dm_cblock_t cblock);
 
 	/*
 	 * Override functions used on the error paths of the core target.
-- 
cgit v1.2.3


From 9d1b404cbc3f990a4035dcf7ddd37adac2a99b3f Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Thu, 15 Sep 2016 09:36:24 -0400
Subject: dm cache policy smq: distribute entries to random levels when
 switching to smq

For smq the 32 bit 'hint' stores the multiqueue level that the entry
should be stored in.  If a different policy has been used previously,
and then switched to smq, the hints will be invalid.  In which case we
used to put all entries in the bottom level of the multiqueue, and then
redistribute.  Redistribution is faster if we put entries with invalid
hints in random levels initially.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-policy-smq.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
index f3cec4e6333c..c33f4a6e1d7d 100644
--- a/drivers/md/dm-cache-policy-smq.c
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -1359,6 +1359,11 @@ static void smq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
 	spin_unlock_irqrestore(&mq->lock, flags);
 }
 
+static unsigned random_level(dm_cblock_t cblock)
+{
+	return hash_32_generic(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1);
+}
+
 static int smq_load_mapping(struct dm_cache_policy *p,
 			    dm_oblock_t oblock, dm_cblock_t cblock,
 			    uint32_t hint, bool hint_valid)
@@ -1369,7 +1374,7 @@ static int smq_load_mapping(struct dm_cache_policy *p,
 	e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock));
 	e->oblock = oblock;
 	e->dirty = false;	/* this gets corrected in a minute */
-	e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : 1;
+	e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock);
 	push(mq, e);
 
 	return 0;
-- 
cgit v1.2.3


From 7d111c81fa29041c730010450618917fb05cab62 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Thu, 15 Sep 2016 10:49:24 -0400
Subject: dm btree: introduce cursor api

This uses prefetching to speed up iteration through a btree.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/persistent-data/dm-btree.c | 162 ++++++++++++++++++++++++++++++++++
 drivers/md/persistent-data/dm-btree.h |  35 ++++++++
 2 files changed, 197 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index 2cc1877804c2..20a40329d84a 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -994,3 +994,165 @@ int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
 	return walk_node(info, root, fn, context);
 }
 EXPORT_SYMBOL_GPL(dm_btree_walk);
+
+/*----------------------------------------------------------------*/
+
+static void prefetch_values(struct dm_btree_cursor *c)
+{
+	unsigned i, nr;
+	__le64 value_le;
+	struct cursor_node *n = c->nodes + c->depth - 1;
+	struct btree_node *bn = dm_block_data(n->b);
+	struct dm_block_manager *bm = dm_tm_get_bm(c->info->tm);
+
+	BUG_ON(c->info->value_type.size != sizeof(value_le));
+
+	nr = le32_to_cpu(bn->header.nr_entries);
+	for (i = 0; i < nr; i++) {
+		memcpy(&value_le, value_ptr(bn, i), sizeof(value_le));
+		dm_bm_prefetch(bm, le64_to_cpu(value_le));
+	}
+}
+
+static bool leaf_node(struct dm_btree_cursor *c)
+{
+	struct cursor_node *n = c->nodes + c->depth - 1;
+	struct btree_node *bn = dm_block_data(n->b);
+
+	return le32_to_cpu(bn->header.flags) & LEAF_NODE;
+}
+
+static int push_node(struct dm_btree_cursor *c, dm_block_t b)
+{
+	int r;
+	struct cursor_node *n = c->nodes + c->depth;
+
+	if (c->depth >= DM_BTREE_CURSOR_MAX_DEPTH - 1) {
+		DMERR("couldn't push cursor node, stack depth too high");
+		return -EINVAL;
+	}
+
+	r = bn_read_lock(c->info, b, &n->b);
+	if (r)
+		return r;
+
+	n->index = 0;
+	c->depth++;
+
+	if (c->prefetch_leaves || !leaf_node(c))
+		prefetch_values(c);
+
+	return 0;
+}
+
+static void pop_node(struct dm_btree_cursor *c)
+{
+	c->depth--;
+	unlock_block(c->info, c->nodes[c->depth].b);
+}
+
+static int inc_or_backtrack(struct dm_btree_cursor *c)
+{
+	struct cursor_node *n;
+	struct btree_node *bn;
+
+	for (;;) {
+		if (!c->depth)
+			return -ENODATA;
+
+		n = c->nodes + c->depth - 1;
+		bn = dm_block_data(n->b);
+
+		n->index++;
+		if (n->index < le32_to_cpu(bn->header.nr_entries))
+			break;
+
+		pop_node(c);
+	}
+
+	return 0;
+}
+
+static int find_leaf(struct dm_btree_cursor *c)
+{
+	int r = 0;
+	struct cursor_node *n;
+	struct btree_node *bn;
+	__le64 value_le;
+
+	for (;;) {
+		n = c->nodes + c->depth - 1;
+		bn = dm_block_data(n->b);
+
+		if (le32_to_cpu(bn->header.flags) & LEAF_NODE)
+			break;
+
+		memcpy(&value_le, value_ptr(bn, n->index), sizeof(value_le));
+		r = push_node(c, le64_to_cpu(value_le));
+		if (r) {
+			DMERR("push_node failed");
+			break;
+		}
+	}
+
+	if (!r && (le32_to_cpu(bn->header.nr_entries) == 0))
+		return -ENODATA;
+
+	return r;
+}
+
+int dm_btree_cursor_begin(struct dm_btree_info *info, dm_block_t root,
+			  bool prefetch_leaves, struct dm_btree_cursor *c)
+{
+	int r;
+
+	c->info = info;
+	c->root = root;
+	c->depth = 0;
+	c->prefetch_leaves = prefetch_leaves;
+
+	r = push_node(c, root);
+	if (r)
+		return r;
+
+	return find_leaf(c);
+}
+EXPORT_SYMBOL_GPL(dm_btree_cursor_begin);
+
+void dm_btree_cursor_end(struct dm_btree_cursor *c)
+{
+	while (c->depth)
+		pop_node(c);
+}
+EXPORT_SYMBOL_GPL(dm_btree_cursor_end);
+
+int dm_btree_cursor_next(struct dm_btree_cursor *c)
+{
+	int r = inc_or_backtrack(c);
+	if (!r) {
+		r = find_leaf(c);
+		if (r)
+			DMERR("find_leaf failed");
+	}
+
+	return r;
+}
+EXPORT_SYMBOL_GPL(dm_btree_cursor_next);
+
+int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le)
+{
+	if (c->depth) {
+		struct cursor_node *n = c->nodes + c->depth - 1;
+		struct btree_node *bn = dm_block_data(n->b);
+
+		if (le32_to_cpu(bn->header.flags) & INTERNAL_NODE)
+			return -EINVAL;
+
+		*key = le64_to_cpu(*key_ptr(bn, n->index));
+		memcpy(value_le, value_ptr(bn, n->index), c->info->value_type.size);
+		return 0;
+
+	} else
+		return -ENODATA;
+}
+EXPORT_SYMBOL_GPL(dm_btree_cursor_get_value);
diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h
index c74301fa5a37..db9bd26adf31 100644
--- a/drivers/md/persistent-data/dm-btree.h
+++ b/drivers/md/persistent-data/dm-btree.h
@@ -176,4 +176,39 @@ int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
 		  int (*fn)(void *context, uint64_t *keys, void *leaf),
 		  void *context);
 
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Cursor API.  This does not follow the rolling lock convention.  Since we
+ * know the order that values are required we can issue prefetches to speed
+ * up iteration.  Use on a single level btree only.
+ */
+#define DM_BTREE_CURSOR_MAX_DEPTH 16
+
+struct cursor_node {
+	struct dm_block *b;
+	unsigned index;
+};
+
+struct dm_btree_cursor {
+	struct dm_btree_info *info;
+	dm_block_t root;
+
+	bool prefetch_leaves;
+	unsigned depth;
+	struct cursor_node nodes[DM_BTREE_CURSOR_MAX_DEPTH];
+};
+
+/*
+ * Creates a fresh cursor.  If prefetch_leaves is set then it is assumed
+ * the btree contains block indexes that will be prefetched.  The cursor is
+ * quite large, so you probably don't want to put it on the stack.
+ */
+int dm_btree_cursor_begin(struct dm_btree_info *info, dm_block_t root,
+			  bool prefetch_leaves, struct dm_btree_cursor *c);
+void dm_btree_cursor_end(struct dm_btree_cursor *c);
+int dm_btree_cursor_next(struct dm_btree_cursor *c);
+int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le);
+
 #endif	/* _LINUX_DM_BTREE_H */
-- 
cgit v1.2.3


From fdd1315aa5f022fe6574efdc2d9535f75a0ee255 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Thu, 15 Sep 2016 11:11:42 -0400
Subject: dm array: introduce cursor api

More efficient way to iterate an array due to prefetching (makes use of
the new dm_btree_cursor_* api).

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/persistent-data/dm-array.c | 86 +++++++++++++++++++++++++++++++++++
 drivers/md/persistent-data/dm-array.h | 33 ++++++++++++++
 2 files changed, 119 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c
index 155180954cdf..e83047cbb2da 100644
--- a/drivers/md/persistent-data/dm-array.c
+++ b/drivers/md/persistent-data/dm-array.c
@@ -899,3 +899,89 @@ int dm_array_walk(struct dm_array_info *info, dm_block_t root,
 EXPORT_SYMBOL_GPL(dm_array_walk);
 
 /*----------------------------------------------------------------*/
+
+static int load_ablock(struct dm_array_cursor *c)
+{
+	int r;
+	__le64 value_le;
+	uint64_t key;
+
+	if (c->block)
+		unlock_ablock(c->info, c->block);
+
+	c->block = NULL;
+	c->ab = NULL;
+	c->index = 0;
+
+	r = dm_btree_cursor_get_value(&c->cursor, &key, &value_le);
+	if (r) {
+		DMERR("dm_btree_cursor_get_value failed");
+		dm_btree_cursor_end(&c->cursor);
+
+	} else {
+		r = get_ablock(c->info, le64_to_cpu(value_le), &c->block, &c->ab);
+		if (r) {
+			DMERR("get_ablock failed");
+			dm_btree_cursor_end(&c->cursor);
+		}
+	}
+
+	return r;
+}
+
+int dm_array_cursor_begin(struct dm_array_info *info, dm_block_t root,
+			  struct dm_array_cursor *c)
+{
+	int r;
+
+	memset(c, 0, sizeof(*c));
+	c->info = info;
+	r = dm_btree_cursor_begin(&info->btree_info, root, true, &c->cursor);
+	if (r) {
+		DMERR("couldn't create btree cursor");
+		return r;
+	}
+
+	return load_ablock(c);
+}
+EXPORT_SYMBOL_GPL(dm_array_cursor_begin);
+
+void dm_array_cursor_end(struct dm_array_cursor *c)
+{
+	if (c->block) {
+		unlock_ablock(c->info, c->block);
+		dm_btree_cursor_end(&c->cursor);
+	}
+}
+EXPORT_SYMBOL_GPL(dm_array_cursor_end);
+
+int dm_array_cursor_next(struct dm_array_cursor *c)
+{
+	int r;
+
+	if (!c->block)
+		return -ENODATA;
+
+	c->index++;
+
+	if (c->index >= le32_to_cpu(c->ab->nr_entries)) {
+		r = dm_btree_cursor_next(&c->cursor);
+		if (r)
+			return r;
+
+		r = load_ablock(c);
+		if (r)
+			return r;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dm_array_cursor_next);
+
+void dm_array_cursor_get_value(struct dm_array_cursor *c, void **value_le)
+{
+	*value_le = element_at(c->info, c->ab, c->index);
+}
+EXPORT_SYMBOL_GPL(dm_array_cursor_get_value);
+
+/*----------------------------------------------------------------*/
diff --git a/drivers/md/persistent-data/dm-array.h b/drivers/md/persistent-data/dm-array.h
index 45ea3f6e1ded..27ee49a55473 100644
--- a/drivers/md/persistent-data/dm-array.h
+++ b/drivers/md/persistent-data/dm-array.h
@@ -182,4 +182,37 @@ int dm_array_walk(struct dm_array_info *info, dm_block_t root,
 
 /*----------------------------------------------------------------*/
 
+/*
+ * Cursor api.
+ *
+ * This lets you iterate through all the entries in an array efficiently
+ * (it will preload metadata).
+ *
+ * I'm using a cursor, rather than a walk function with a callback because
+ * the cache target needs to iterate both the mapping and hint arrays in
+ * unison.
+ */
+struct dm_array_cursor {
+	struct dm_array_info *info;
+	struct dm_btree_cursor cursor;
+
+	struct dm_block *block;
+	struct array_block *ab;
+	unsigned index;
+};
+
+int dm_array_cursor_begin(struct dm_array_info *info,
+			  dm_block_t root, struct dm_array_cursor *c);
+void dm_array_cursor_end(struct dm_array_cursor *c);
+
+uint32_t dm_array_cursor_index(struct dm_array_cursor *c);
+int dm_array_cursor_next(struct dm_array_cursor *c);
+
+/*
+ * value_le is only valid while the cursor points at the current value.
+ */
+void dm_array_cursor_get_value(struct dm_array_cursor *c, void **value_le);
+
+/*----------------------------------------------------------------*/
+
 #endif	/* _LINUX_DM_ARRAY_H */
-- 
cgit v1.2.3


From f177940a80917044f0ea6fd18ea1bfbdff012050 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Tue, 20 Sep 2016 06:16:00 -0400
Subject: dm cache metadata: switch to using the new cursor api for loading
 metadata

This change offers a pretty significant performance improvement.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-metadata.c | 103 ++++++++++++++++++++++++++++++++---------
 1 file changed, 80 insertions(+), 23 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index a60f10a0ee0a..695577812cf6 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -140,6 +140,13 @@ struct dm_cache_metadata {
 	 * the device.
 	 */
 	bool fail_io:1;
+
+	/*
+	 * These structures are used when loading metadata.  They're too
+	 * big to put on the stack.
+	 */
+	struct dm_array_cursor mapping_cursor;
+	struct dm_array_cursor hint_cursor;
 };
 
 /*-------------------------------------------------------------------
@@ -1171,31 +1178,37 @@ static bool hints_array_available(struct dm_cache_metadata *cmd,
 		hints_array_initialized(cmd);
 }
 
-static int __load_mapping(void *context, uint64_t cblock, void *leaf)
+static int __load_mapping(struct dm_cache_metadata *cmd,
+			  uint64_t cb, bool hints_valid,
+			  struct dm_array_cursor *mapping_cursor,
+			  struct dm_array_cursor *hint_cursor,
+			  load_mapping_fn fn, void *context)
 {
 	int r = 0;
-	bool dirty;
-	__le64 value;
-	__le32 hint_value = 0;
+
+	__le64 mapping;
+	__le32 hint = 0;
+
+	__le64 *mapping_value_le;
+	__le32 *hint_value_le;
+
 	dm_oblock_t oblock;
 	unsigned flags;
-	struct thunk *thunk = context;
-	struct dm_cache_metadata *cmd = thunk->cmd;
 
-	memcpy(&value, leaf, sizeof(value));
-	unpack_value(value, &oblock, &flags);
+	dm_array_cursor_get_value(mapping_cursor, (void **) &mapping_value_le);
+	memcpy(&mapping, mapping_value_le, sizeof(mapping));
+	unpack_value(mapping, &oblock, &flags);
 
 	if (flags & M_VALID) {
-		if (thunk->hints_valid) {
-			r = dm_array_get_value(&cmd->hint_info, cmd->hint_root,
-					       cblock, &hint_value);
-			if (r && r != -ENODATA)
-				return r;
+		if (hints_valid) {
+			dm_array_cursor_get_value(hint_cursor, (void **) &hint_value_le);
+			memcpy(&hint, hint_value_le, sizeof(hint));
 		}
 
-		dirty = thunk->respect_dirty_flags ? (flags & M_DIRTY) : true;
-		r = thunk->fn(thunk->context, oblock, to_cblock(cblock),
-			      dirty, le32_to_cpu(hint_value), thunk->hints_valid);
+		r = fn(context, oblock, to_cblock(cb), flags & M_DIRTY,
+		       le32_to_cpu(hint), hints_valid);
+		if (r)
+			DMERR("policy couldn't load cblock");
 	}
 
 	return r;
@@ -1205,16 +1218,60 @@ static int __load_mappings(struct dm_cache_metadata *cmd,
 			   struct dm_cache_policy *policy,
 			   load_mapping_fn fn, void *context)
 {
-	struct thunk thunk;
+	int r;
+	uint64_t cb;
 
-	thunk.fn = fn;
-	thunk.context = context;
+	bool hints_valid = hints_array_available(cmd, policy);
 
-	thunk.cmd = cmd;
-	thunk.respect_dirty_flags = cmd->clean_when_opened;
-	thunk.hints_valid = hints_array_available(cmd, policy);
+	if (from_cblock(cmd->cache_blocks) == 0)
+		/* Nothing to do */
+		return 0;
 
-	return dm_array_walk(&cmd->info, cmd->root, __load_mapping, &thunk);
+	r = dm_array_cursor_begin(&cmd->info, cmd->root, &cmd->mapping_cursor);
+	if (r)
+		return r;
+
+	if (hints_valid) {
+		r = dm_array_cursor_begin(&cmd->hint_info, cmd->hint_root, &cmd->hint_cursor);
+		if (r) {
+			dm_array_cursor_end(&cmd->mapping_cursor);
+			return r;
+		}
+	}
+
+	for (cb = 0; ; cb++) {
+		r = __load_mapping(cmd, cb, hints_valid,
+				   &cmd->mapping_cursor, &cmd->hint_cursor,
+				   fn, context);
+		if (r)
+			goto out;
+
+		/*
+		 * We need to break out before we move the cursors.
+		 */
+		if (cb >= (from_cblock(cmd->cache_blocks) - 1))
+			break;
+
+		r = dm_array_cursor_next(&cmd->mapping_cursor);
+		if (r) {
+			DMERR("dm_array_cursor_next for mapping failed");
+			goto out;
+		}
+
+		if (hints_valid) {
+			r = dm_array_cursor_next(&cmd->hint_cursor);
+			if (r) {
+				DMERR("dm_array_cursor_next for hint failed");
+				goto out;
+			}
+		}
+	}
+out:
+	dm_array_cursor_end(&cmd->mapping_cursor);
+	if (hints_valid)
+		dm_array_cursor_end(&cmd->hint_cursor);
+
+	return r;
 }
 
 int dm_cache_load_mappings(struct dm_cache_metadata *cmd,
-- 
cgit v1.2.3


From f659b10087daaf4ce0087c3f6aec16746be9628f Mon Sep 17 00:00:00 2001
From: Rabin Vincent <rabinv@axis.com>
Date: Wed, 21 Sep 2016 16:22:29 +0200
Subject: dm crypt: fix crash on exit

As the documentation for kthread_stop() says, "if threadfn() may call
do_exit() itself, the caller must ensure task_struct can't go away".
dm-crypt does not ensure this and therefore crashes when crypt_dtr()
calls kthread_stop().  The crash is trivially reproducible by adding a
delay before the call to kthread_stop() and just opening and closing a
dm-crypt device.

 general protection fault: 0000 [#1] PREEMPT SMP
 CPU: 0 PID: 533 Comm: cryptsetup Not tainted 4.8.0-rc7+ #7
 task: ffff88003bd0df40 task.stack: ffff8800375b4000
 RIP: 0010: kthread_stop+0x52/0x300
 Call Trace:
  crypt_dtr+0x77/0x120
  dm_table_destroy+0x6f/0x120
  __dm_destroy+0x130/0x250
  dm_destroy+0x13/0x20
  dev_remove+0xe6/0x120
  ? dev_suspend+0x250/0x250
  ctl_ioctl+0x1fc/0x530
  ? __lock_acquire+0x24f/0x1b10
  dm_ctl_ioctl+0x13/0x20
  do_vfs_ioctl+0x91/0x6a0
  ? ____fput+0xe/0x10
  ? entry_SYSCALL_64_fastpath+0x5/0xbd
  ? trace_hardirqs_on_caller+0x151/0x1e0
  SyS_ioctl+0x41/0x70
  entry_SYSCALL_64_fastpath+0x1f/0xbd

This problem was introduced by bcbd94ff481e ("dm crypt: fix a possible
hang due to race condition on exit").

Looking at the description of that patch (excerpted below), it seems
like the problem it addresses can be solved by just using
set_current_state instead of __set_current_state, since we obviously
need the memory barrier.

| dm crypt: fix a possible hang due to race condition on exit
|
| A kernel thread executes __set_current_state(TASK_INTERRUPTIBLE),
| __add_wait_queue, spin_unlock_irq and then tests kthread_should_stop().
| It is possible that the processor reorders memory accesses so that
| kthread_should_stop() is executed before __set_current_state().  If
| such reordering happens, there is a possible race on thread
| termination: [...]

So this patch just reverts the aforementioned patch and changes the
__set_current_state(TASK_INTERRUPTIBLE) to set_current_state(...).  This
fixes the crash and should also fix the potential hang.

Fixes: bcbd94ff481e ("dm crypt: fix a possible hang due to race condition on exit")
Cc: Mikulas Patocka <mpatocka@redhat.com>
Cc: stable@vger.kernel.org # v4.0+
Signed-off-by: Rabin Vincent <rabinv@axis.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-crypt.c | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 9ba0f0724d28..bcba462a7d14 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -113,8 +113,7 @@ struct iv_tcw_private {
  * and encrypts / decrypts at the same time.
  */
 enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID,
-	     DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD,
-	     DM_CRYPT_EXIT_THREAD};
+	     DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD };
 
 /*
  * The fields in here must be read only after initialization.
@@ -1207,18 +1206,20 @@ continue_locked:
 		if (!RB_EMPTY_ROOT(&cc->write_tree))
 			goto pop_from_list;
 
-		if (unlikely(test_bit(DM_CRYPT_EXIT_THREAD, &cc->flags))) {
-			spin_unlock_irq(&cc->write_thread_wait.lock);
-			break;
-		}
-
-		__set_current_state(TASK_INTERRUPTIBLE);
+		set_current_state(TASK_INTERRUPTIBLE);
 		__add_wait_queue(&cc->write_thread_wait, &wait);
 
 		spin_unlock_irq(&cc->write_thread_wait.lock);
 
+		if (unlikely(kthread_should_stop())) {
+			set_task_state(current, TASK_RUNNING);
+			remove_wait_queue(&cc->write_thread_wait, &wait);
+			break;
+		}
+
 		schedule();
 
+		set_task_state(current, TASK_RUNNING);
 		spin_lock_irq(&cc->write_thread_wait.lock);
 		__remove_wait_queue(&cc->write_thread_wait, &wait);
 		goto continue_locked;
@@ -1533,13 +1534,8 @@ static void crypt_dtr(struct dm_target *ti)
 	if (!cc)
 		return;
 
-	if (cc->write_thread) {
-		spin_lock_irq(&cc->write_thread_wait.lock);
-		set_bit(DM_CRYPT_EXIT_THREAD, &cc->flags);
-		wake_up_locked(&cc->write_thread_wait);
-		spin_unlock_irq(&cc->write_thread_wait.lock);
+	if (cc->write_thread)
 		kthread_stop(cc->write_thread);
-	}
 
 	if (cc->io_queue)
 		destroy_workqueue(cc->io_queue);
-- 
cgit v1.2.3


From 7cd326747f46ffe1c7bff5682e97dfbcb98990ec Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 13 Sep 2016 10:45:20 +0200
Subject: dm bufio: remove dm_bufio_cond_resched()

Use cond_resched() like everybody else.

Mikulas explained why dm_bufio_cond_resched() was introduced to begin
with (hopefully cond_resched can be improved accordingly) here:
https://www.redhat.com/archives/dm-devel/2016-September/msg00112.html

Cc: Ingo Molnar <mingo@kernel.org>
Cc: Mikulas Patocka <mpatocka@redhat.com>
Cc: Mike Snitzer <snitzer@redhat.com>
Cc: Alasdair Kergon <agk@redhat.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Mike Snitzer <snitzer@redhat.com> # added last comment in header
---
 drivers/md/dm-bufio.c | 31 +++++++++----------------------
 1 file changed, 9 insertions(+), 22 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 6571c81465e1..27f4a802f83c 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -191,19 +191,6 @@ static void dm_bufio_unlock(struct dm_bufio_client *c)
 	mutex_unlock(&c->lock);
 }
 
-/*
- * FIXME Move to sched.h?
- */
-#ifdef CONFIG_PREEMPT_VOLUNTARY
-#  define dm_bufio_cond_resched()		\
-do {						\
-	if (unlikely(need_resched()))		\
-		_cond_resched();		\
-} while (0)
-#else
-#  define dm_bufio_cond_resched()                do { } while (0)
-#endif
-
 /*----------------------------------------------------------------*/
 
 /*
@@ -741,7 +728,7 @@ static void __flush_write_list(struct list_head *write_list)
 			list_entry(write_list->next, struct dm_buffer, write_list);
 		list_del(&b->write_list);
 		submit_io(b, WRITE, b->block, write_endio);
-		dm_bufio_cond_resched();
+		cond_resched();
 	}
 	blk_finish_plug(&plug);
 }
@@ -780,7 +767,7 @@ static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
 			__unlink_buffer(b);
 			return b;
 		}
-		dm_bufio_cond_resched();
+		cond_resched();
 	}
 
 	list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) {
@@ -791,7 +778,7 @@ static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
 			__unlink_buffer(b);
 			return b;
 		}
-		dm_bufio_cond_resched();
+		cond_resched();
 	}
 
 	return NULL;
@@ -923,7 +910,7 @@ static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
 			return;
 
 		__write_dirty_buffer(b, write_list);
-		dm_bufio_cond_resched();
+		cond_resched();
 	}
 }
 
@@ -973,7 +960,7 @@ static void __check_watermark(struct dm_bufio_client *c,
 			return;
 
 		__free_buffer_wake(b);
-		dm_bufio_cond_resched();
+		cond_resched();
 	}
 
 	if (c->n_buffers[LIST_DIRTY] > threshold_buffers)
@@ -1170,7 +1157,7 @@ void dm_bufio_prefetch(struct dm_bufio_client *c,
 				submit_io(b, READ, b->block, read_endio);
 			dm_bufio_release(b);
 
-			dm_bufio_cond_resched();
+			cond_resched();
 
 			if (!n_blocks)
 				goto flush_plug;
@@ -1291,7 +1278,7 @@ again:
 		    !test_bit(B_WRITING, &b->state))
 			__relink_lru(b, LIST_CLEAN);
 
-		dm_bufio_cond_resched();
+		cond_resched();
 
 		/*
 		 * If we dropped the lock, the list is no longer consistent,
@@ -1574,7 +1561,7 @@ static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
 				freed++;
 			if (!--nr_to_scan || ((count - freed) <= retain_target))
 				return freed;
-			dm_bufio_cond_resched();
+			cond_resched();
 		}
 	}
 	return freed;
@@ -1808,7 +1795,7 @@ static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
 		if (__try_evict_buffer(b, 0))
 			count--;
 
-		dm_bufio_cond_resched();
+		cond_resched();
 	}
 
 	dm_bufio_unlock(c);
-- 
cgit v1.2.3


From 8ff232c1a819c2e98d85974a3bff0b7b8e2970ed Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Wed, 15 Jul 2015 13:23:24 +0200
Subject: dm mpath: always return reservation conflict without failing over

If dm-mpath encounters an reservation conflict it should not fail the
path (as communication with the target is not affected) but should
rather retry on another path.  However, in doing so we might be inducing
a ping-pong between paths, with no guarantee of any forward progress.
And arguably a reservation conflict is an unexpected error, so we should
be passing it upwards to allow the application to take appropriate
steps.

This change resolves a show-stopper problem seen with the pNFS SCSI
layout because it is trivial to hit reservation conflict based failover
loops without it.

Doubts were raised about the implications of this change relative to
products like IBM's SVC.  But there is little point withholding a fix
for Linux because a proprietary product may or may not have some issues
in its implementation of how it interfaces with Linux.  In the future,
if there is glaring evidence that this change is certainly problematic
we can revisit it.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Acked-by: Christoph Hellwig <hch@lst.de>
Tested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Mike Snitzer <snitzer@redhat.com> # tweaked header
---
 drivers/md/dm-mpath.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index f31fa1364abc..e477af8596e2 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1530,6 +1530,14 @@ static void activate_path(struct work_struct *work)
 static int noretry_error(int error)
 {
 	switch (error) {
+	case -EBADE:
+		/*
+		 * EBADE signals an reservation conflict.
+		 * We shouldn't fail the path here as we can communicate with
+		 * the target.  We should failover to the next path, but in
+		 * doing so we might be causing a ping-pong between paths.
+		 * So just return the reservation conflict error.
+		 */
 	case -EOPNOTSUPP:
 	case -EREMOTEIO:
 	case -EILSEQ:
@@ -1574,9 +1582,6 @@ static int do_end_io(struct multipath *m, struct request *clone,
 		if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
 			if (!must_push_back_rq(m))
 				r = -EIO;
-		} else {
-			if (error == -EBADE)
-				r = error;
 		}
 	}
 
@@ -1625,9 +1630,6 @@ static int do_end_io_bio(struct multipath *m, struct bio *clone,
 			if (!must_push_back_bio(m))
 				return -EIO;
 			return DM_ENDIO_REQUEUE;
-		} else {
-			if (error == -EBADE)
-				return error;
 		}
 	}
 
-- 
cgit v1.2.3