From 5261b85e586afe6ebe54e16e0a8acc32fc6d4902 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 13 Mar 2014 11:23:39 +1030 Subject: virtio_blk: don't crash, report error if virtqueue is broken. A bad implementation of virtio might cause us to mark the virtqueue broken: we'll dev_err() in that case, and the device is useless, but let's not BUG_ON(). ENOMEM or ENOSPC implies the ring is full, and we should try again later (-ENOMEM is documented to happen, but doesn't, as we fall through to ENOSPC). EIO means it's broken. Signed-off-by: Rusty Russell --- drivers/block/virtio_blk.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index b1cb3f4c4db4..a2db9ed288f2 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -158,6 +158,7 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req) unsigned long flags; unsigned int num; const bool last = (req->cmd_flags & REQ_END) != 0; + int err; BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems); @@ -198,11 +199,16 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req) } spin_lock_irqsave(&vblk->vq_lock, flags); - if (__virtblk_add_req(vblk->vq, vbr, vbr->sg, num) < 0) { + err = __virtblk_add_req(vblk->vq, vbr, vbr->sg, num); + if (err) { virtqueue_kick(vblk->vq); spin_unlock_irqrestore(&vblk->vq_lock, flags); blk_mq_stop_hw_queue(hctx); - return BLK_MQ_RQ_QUEUE_BUSY; + /* Out of mem doesn't actually happen, since we fall back + * to direct descriptors */ + if (err == -ENOMEM || err == -ENOSPC) + return BLK_MQ_RQ_QUEUE_BUSY; + return BLK_MQ_RQ_QUEUE_ERROR; } if (last) -- cgit v1.2.3 From fc4324b4597c4eb8907207e82f9a6acec84dd335 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 19 Mar 2014 17:08:24 +1030 Subject: virtio-blk: base queue-depth on virtqueue ringsize or module param Venkatesh spake thus: virtio-blk set the default queue depth to 64 requests, which was insufficient for high-IOPS devices. Instead set the blk-queue depth to the device's virtqueue depth divided by two (each I/O requires at least two VQ entries). But behold, Ted added a module parameter: Also allow the queue depth to be something which can be set at module load time or via a kernel boot-time parameter, for testing/benchmarking purposes. And I rewrote it substantially, mainly to take VIRTIO_RING_F_INDIRECT_DESC into account. As QEMU sets the vq size for PCI to 128, Venkatesh's patch wouldn't have made a change. This version does (since QEMU also offers VIRTIO_RING_F_INDIRECT_DESC). Inspired-by: "Theodore Ts'o" Based-on-the-true-story-of: Venkatesh Srinivas Cc: "Michael S.
Tsirkin" Cc: virtio-dev@lists.oasis-open.org Cc: virtualization@lists.linux-foundation.org Cc: Frank Swiderski Signed-off-by: Rusty Russell --- drivers/block/virtio_blk.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index a2db9ed288f2..196222271a50 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -491,10 +491,11 @@ static struct blk_mq_ops virtio_mq_ops = { static struct blk_mq_reg virtio_mq_reg = { .ops = &virtio_mq_ops, .nr_hw_queues = 1, - .queue_depth = 64, + .queue_depth = 0, /* Set in virtblk_probe */ .numa_node = NUMA_NO_NODE, .flags = BLK_MQ_F_SHOULD_MERGE, }; +module_param_named(queue_depth, virtio_mq_reg.queue_depth, uint, 0444); static void virtblk_init_vbr(void *data, struct blk_mq_hw_ctx *hctx, struct request *rq, unsigned int nr) @@ -558,6 +559,13 @@ static int virtblk_probe(struct virtio_device *vdev) goto out_free_vq; } + /* Default queue sizing is to fill the ring. */ + if (!virtio_mq_reg.queue_depth) { + virtio_mq_reg.queue_depth = vblk->vq->num_free; + /* ... but without indirect descs, we use 2 descs per req */ + if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC)) + virtio_mq_reg.queue_depth /= 2; + } virtio_mq_reg.cmd_size = sizeof(struct virtblk_req) + sizeof(struct scatterlist) * sg_elems; -- cgit v1.2.3 From 671a6018db1d359254c8201704cf967efb42fc25 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 13 Feb 2014 11:19:14 +0900 Subject: NVMe: Add CONFIG_PM_SLEEP to suspend/resume functions Add CONFIG_PM_SLEEP to suspend/resume functions to fix the following build warning when CONFIG_PM_SLEEP is not selected. This is because sleep PM callbacks defined by SIMPLE_DEV_PM_OPS are only used when the CONFIG_PM_SLEEP is enabled. drivers/block/nvme-core.c:2541:12: warning: 'nvme_suspend' defined but not used [-Wunused-function] drivers/block/nvme-core.c:2550:12: warning: 'nvme_resume' defined but not used [-Wunused-function] Signed-off-by: Jingoo Han Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 51824d1f23ea..87dd4c79e65f 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2538,6 +2538,7 @@ static void nvme_remove(struct pci_dev *pdev) #define nvme_slot_reset NULL #define nvme_error_resume NULL +#ifdef CONFIG_PM_SLEEP static int nvme_suspend(struct device *dev) { struct pci_dev *pdev = to_pci_dev(dev); @@ -2558,6 +2559,7 @@ static int nvme_resume(struct device *dev) } return 0; } +#endif static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume); -- cgit v1.2.3 From fb35e914b3f88cda9ee6f9d776910c35269c4ecf Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 3 Mar 2014 11:09:47 -0700 Subject: NVMe: Initialize device reference count earlier If an NVMe device becomes ready but fails to create IO queues, the driver creates a character device handle so the device can be managed. The device reference count needs to be initialized before creating the character device. 
Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 87dd4c79e65f..f565212a9e32 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2464,6 +2464,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto release; + kref_init(&dev->kref); result = nvme_dev_start(dev); if (result) { if (result == -EBUSY) @@ -2471,7 +2472,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto release_pools; } - kref_init(&dev->kref); result = nvme_dev_add(dev); if (result) goto shutdown; -- cgit v1.2.3 From 5a92e700af2e5e0e6404988d6a7f2ed3dad3f46f Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 21 Feb 2014 14:13:44 -0700 Subject: NVMe: RCU protected access to io queues This adds rcu protected access to nvme_queue to fix a race between a surprise removal freeing the queue and a thread with an open reference on an NVMe block device using that queue. The queues do not need to be rcu protected during the initialization or shutdown parts, so I've added a helper function for raw dereferencing to get around the sparse errors. There is still a hole in the IOCTL path for the same problem, which is fixed in a subsequent patch. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 91 +++++++++++++++++++++++------------------------ 1 file changed, 45 insertions(+), 46 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index f565212a9e32..b66ab1db4629 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -74,6 +74,7 @@ struct async_cmd_info { * commands and one for I/O commands).
*/ struct nvme_queue { + struct rcu_head r_head; struct device *q_dmadev; struct nvme_dev *dev; char irqname[24]; /* nvme4294967295-65535\0 */ @@ -262,14 +263,21 @@ static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid, return ctx; } -struct nvme_queue *get_nvmeq(struct nvme_dev *dev) +static struct nvme_queue *raw_nvmeq(struct nvme_dev *dev, int qid) { - return dev->queues[get_cpu() + 1]; + return rcu_dereference_raw(dev->queues[qid]); } -void put_nvmeq(struct nvme_queue *nvmeq) +struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU) +{ + rcu_read_lock(); + return rcu_dereference(dev->queues[get_cpu() + 1]); +} + +void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU) { put_cpu(); + rcu_read_unlock(); } /** @@ -852,13 +860,14 @@ static int nvme_submit_async_cmd(struct nvme_queue *nvmeq, int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, u32 *result) { - return nvme_submit_sync_cmd(dev->queues[0], cmd, result, ADMIN_TIMEOUT); + return nvme_submit_sync_cmd(raw_nvmeq(dev, 0), cmd, result, + ADMIN_TIMEOUT); } static int nvme_submit_admin_cmd_async(struct nvme_dev *dev, struct nvme_command *cmd, struct async_cmd_info *cmdinfo) { - return nvme_submit_async_cmd(dev->queues[0], cmd, cmdinfo, + return nvme_submit_async_cmd(raw_nvmeq(dev, 0), cmd, cmdinfo, ADMIN_TIMEOUT); } @@ -985,6 +994,7 @@ static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq) struct nvme_command cmd; struct nvme_dev *dev = nvmeq->dev; struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); + struct nvme_queue *adminq; if (!nvmeq->qid || info[cmdid].aborted) { if (work_busy(&dev->reset_work)) @@ -1001,7 +1011,8 @@ static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq) if (!dev->abort_limit) return; - a_cmdid = alloc_cmdid(dev->queues[0], CMD_CTX_ABORT, special_completion, + adminq = rcu_dereference(dev->queues[0]); + a_cmdid = alloc_cmdid(adminq, CMD_CTX_ABORT, special_completion, ADMIN_TIMEOUT); if (a_cmdid < 0) return; @@ -1018,7 +1029,7 @@ static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq) dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", cmdid, nvmeq->qid); - nvme_submit_cmd(dev->queues[0], &cmd); + nvme_submit_cmd(adminq, &cmd); } /** @@ -1055,8 +1066,10 @@ static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout) } } -static void nvme_free_queue(struct nvme_queue *nvmeq) +static void nvme_free_queue(struct rcu_head *r) { + struct nvme_queue *nvmeq = container_of(r, struct nvme_queue, r_head); + spin_lock_irq(&nvmeq->q_lock); while (bio_list_peek(&nvmeq->sq_cong)) { struct bio *bio = bio_list_pop(&nvmeq->sq_cong); @@ -1075,10 +1088,13 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest) { int i; + for (i = num_possible_cpus(); i > dev->queue_count - 1; i--) + rcu_assign_pointer(dev->queues[i], NULL); for (i = dev->queue_count - 1; i >= lowest; i--) { - nvme_free_queue(dev->queues[i]); + struct nvme_queue *nvmeq = raw_nvmeq(dev, i); + rcu_assign_pointer(dev->queues[i], NULL); + call_rcu(&nvmeq->r_head, nvme_free_queue); dev->queue_count--; - dev->queues[i] = NULL; } } @@ -1116,7 +1132,7 @@ static void nvme_clear_queue(struct nvme_queue *nvmeq) static void nvme_disable_queue(struct nvme_dev *dev, int qid) { - struct nvme_queue *nvmeq = dev->queues[qid]; + struct nvme_queue *nvmeq = raw_nvmeq(dev, qid); if (!nvmeq) return; @@ -1168,6 +1184,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, nvmeq->qid = qid; nvmeq->q_suspended = 1; dev->queue_count++; + rcu_assign_pointer(dev->queues[qid], nvmeq); 
return nvmeq; @@ -1311,12 +1328,11 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) if (result < 0) return result; - nvmeq = dev->queues[0]; + nvmeq = raw_nvmeq(dev, 0); if (!nvmeq) { nvmeq = nvme_alloc_queue(dev, 0, 64, 0); if (!nvmeq) return -ENOMEM; - dev->queues[0] = nvmeq; } aqa = nvmeq->q_depth - 1; @@ -1581,8 +1597,8 @@ static int nvme_user_admin_cmd(struct nvme_dev *dev, if (length != cmd.data_len) status = -ENOMEM; else - status = nvme_submit_sync_cmd(dev->queues[0], &c, &cmd.result, - timeout); + status = nvme_submit_sync_cmd(raw_nvmeq(dev, 0), &c, + &cmd.result, timeout); if (cmd.data_len) { nvme_unmap_user_pages(dev, cmd.opcode & 1, iod); @@ -1701,8 +1717,10 @@ static int nvme_kthread(void *data) queue_work(nvme_workq, &dev->reset_work); continue; } + rcu_read_lock(); for (i = 0; i < dev->queue_count; i++) { - struct nvme_queue *nvmeq = dev->queues[i]; + struct nvme_queue *nvmeq = + rcu_dereference(dev->queues[i]); if (!nvmeq) continue; spin_lock_irq(&nvmeq->q_lock); @@ -1714,6 +1732,7 @@ static int nvme_kthread(void *data) unlock: spin_unlock_irq(&nvmeq->q_lock); } + rcu_read_unlock(); } spin_unlock(&dev_list_lock); schedule_timeout(round_jiffies_relative(HZ)); @@ -1808,7 +1827,7 @@ static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) static int nvme_setup_io_queues(struct nvme_dev *dev) { - struct nvme_queue *adminq = dev->queues[0]; + struct nvme_queue *adminq = raw_nvmeq(dev, 0); struct pci_dev *pdev = dev->pci_dev; int result, cpu, i, vecs, nr_io_queues, size, q_depth; @@ -1831,7 +1850,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) size = db_bar_size(dev, nr_io_queues); } while (1); dev->dbs = ((void __iomem *)dev->bar) + 4096; - dev->queues[0]->q_db = dev->dbs; + adminq->q_db = dev->dbs; } /* Deregister the admin queue's interrupt */ @@ -1880,19 +1899,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) } /* Free previously allocated queues that are no longer usable */ - spin_lock(&dev_list_lock); - for (i = dev->queue_count - 1; i > nr_io_queues; i--) { - struct nvme_queue *nvmeq = dev->queues[i]; - - spin_lock_irq(&nvmeq->q_lock); - nvme_cancel_ios(nvmeq, false); - spin_unlock_irq(&nvmeq->q_lock); - - nvme_free_queue(nvmeq); - dev->queue_count--; - dev->queues[i] = NULL; - } - spin_unlock(&dev_list_lock); + nvme_free_queues(dev, nr_io_queues); cpu = cpumask_first(cpu_online_mask); for (i = 0; i < nr_io_queues; i++) { @@ -1903,8 +1910,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) q_depth = min_t(int, NVME_CAP_MQES(readq(&dev->bar->cap)) + 1, NVME_Q_DEPTH); for (i = dev->queue_count - 1; i < nr_io_queues; i++) { - dev->queues[i + 1] = nvme_alloc_queue(dev, i + 1, q_depth, i); - if (!dev->queues[i + 1]) { + if (!nvme_alloc_queue(dev, i + 1, q_depth, i)) { result = -ENOMEM; goto free_queues; } @@ -1912,11 +1918,11 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) for (; i < num_possible_cpus(); i++) { int target = i % rounddown_pow_of_two(dev->queue_count - 1); - dev->queues[i + 1] = dev->queues[target + 1]; + rcu_assign_pointer(dev->queues[i + 1], dev->queues[target + 1]); } for (i = 1; i < dev->queue_count; i++) { - result = nvme_create_queue(dev->queues[i], i); + result = nvme_create_queue(raw_nvmeq(dev, i), i); if (result) { for (--i; i > 0; i--) nvme_disable_queue(dev, i); @@ -2180,7 +2186,7 @@ static void nvme_disable_io_queues(struct nvme_dev *dev) atomic_set(&dq.refcount, 0); dq.worker = &worker; for (i = dev->queue_count - 1; i > 0; i--) { - struct nvme_queue *nvmeq = dev->queues[i]; + struct 
nvme_queue *nvmeq = raw_nvmeq(dev, i); if (nvme_suspend_queue(nvmeq)) continue; @@ -2205,7 +2211,7 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) if (!dev->bar || (dev->bar && readl(&dev->bar->csts) == -1)) { for (i = dev->queue_count - 1; i >= 0; i--) { - struct nvme_queue *nvmeq = dev->queues[i]; + struct nvme_queue *nvmeq = raw_nvmeq(dev, i); nvme_suspend_queue(nvmeq); nvme_clear_queue(nvmeq); } @@ -2383,18 +2389,10 @@ static int nvme_remove_dead_ctrl(void *arg) static void nvme_remove_disks(struct work_struct *ws) { - int i; struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work); nvme_dev_remove(dev); - spin_lock(&dev_list_lock); - for (i = dev->queue_count - 1; i > 0; i--) { - BUG_ON(!dev->queues[i] || !dev->queues[i]->q_suspended); - nvme_free_queue(dev->queues[i]); - dev->queue_count--; - dev->queues[i] = NULL; - } - spin_unlock(&dev_list_lock); + nvme_free_queues(dev, 1); } static int nvme_dev_resume(struct nvme_dev *dev) @@ -2526,6 +2524,7 @@ static void nvme_remove(struct pci_dev *pdev) nvme_dev_remove(dev); nvme_dev_shutdown(dev); nvme_free_queues(dev, 0); + rcu_barrier(); nvme_release_instance(dev); nvme_release_prp_pools(dev); kref_put(&dev->kref, nvme_free_dev); -- cgit v1.2.3 From 4f5099af4f3d5f999d8ab7784472d93e810e3912 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 3 Mar 2014 16:39:13 -0700 Subject: NVMe: IOCTL path RCU protect queue access This adds rcu protected access to a queue in the nvme IOCTL path to fix potential races between a surprise removal and queue usage in nvme_submit_sync_cmd. The fix holds the rcu_read_lock() here to prevent the nvme_queue from freeing while this path is executing, so it can't sleep, and so this path will no longer wait for an available command id should they all be in use at the time a passthrough IOCTL request is received.
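Condensed, the access pattern this patch introduces looks like the sketch below. It mirrors the lock_nvmeq()/unlock_nvmeq() helpers added in the diff that follows; submit_sketch() is a hypothetical caller shown only to illustrate the shape of the path, not a function in the driver.

static struct nvme_queue *lock_nvmeq(struct nvme_dev *dev, int q_idx)
	__acquires(RCU)
{
	rcu_read_lock();			/* pin the queue pointer */
	return rcu_dereference(dev->queues[q_idx]);
}

static void unlock_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
{
	rcu_read_unlock();
}

static int submit_sketch(struct nvme_dev *dev, int q_idx)
{
	struct nvme_queue *nvmeq = lock_nvmeq(dev, q_idx);

	if (!nvmeq) {			/* queue vanished (surprise removal) */
		unlock_nvmeq(nvmeq);
		return -ENODEV;
	}
	/* queue the command here; must not sleep under rcu_read_lock() */
	unlock_nvmeq(nvmeq);
	return 0;
}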
Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 82 +++++++++++++++++++++++++++++++---------------- drivers/block/nvme-scsi.c | 31 +++--------------- 2 files changed, 59 insertions(+), 54 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index b66ab1db4629..04664cadadfa 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -268,18 +268,30 @@ static struct nvme_queue *raw_nvmeq(struct nvme_dev *dev, int qid) return rcu_dereference_raw(dev->queues[qid]); } -struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU) +static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU) { rcu_read_lock(); return rcu_dereference(dev->queues[get_cpu() + 1]); } -void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU) +static void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU) { put_cpu(); rcu_read_unlock(); } +static struct nvme_queue *lock_nvmeq(struct nvme_dev *dev, int q_idx) + __acquires(RCU) +{ + rcu_read_lock(); + return rcu_dereference(dev->queues[q_idx]); +} + +static void unlock_nvmeq(struct nvme_queue *nvmeq) __releases(RCU) +{ + rcu_read_unlock(); +} + /** * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell * @nvmeq: The queue to use @@ -292,6 +304,10 @@ static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) unsigned long flags; u16 tail; spin_lock_irqsave(&nvmeq->q_lock, flags); + if (nvmeq->q_suspended) { + spin_unlock_irqrestore(&nvmeq->q_lock, flags); + return -EBUSY; + } tail = nvmeq->sq_tail; memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd)); if (++tail == nvmeq->q_depth) @@ -812,27 +828,46 @@ static void sync_completion(struct nvme_dev *dev, void *ctx, * Returns 0 on success. 
If the result is negative, it's a Linux error code; * if the result is positive, it's an NVM Express status code */ -int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd, +static int nvme_submit_sync_cmd(struct nvme_dev *dev, int q_idx, + struct nvme_command *cmd, u32 *result, unsigned timeout) { - int cmdid; + int cmdid, ret; struct sync_cmd_info cmdinfo; + struct nvme_queue *nvmeq; + + nvmeq = lock_nvmeq(dev, q_idx); + if (!nvmeq) { + unlock_nvmeq(nvmeq); + return -ENODEV; + } cmdinfo.task = current; cmdinfo.status = -EINTR; - cmdid = alloc_cmdid_killable(nvmeq, &cmdinfo, sync_completion, - timeout); - if (cmdid < 0) + cmdid = alloc_cmdid(nvmeq, &cmdinfo, sync_completion, timeout); + if (cmdid < 0) { + unlock_nvmeq(nvmeq); return cmdid; + } cmd->common.command_id = cmdid; set_current_state(TASK_KILLABLE); - nvme_submit_cmd(nvmeq, cmd); + ret = nvme_submit_cmd(nvmeq, cmd); + if (ret) { + free_cmdid(nvmeq, cmdid, NULL); + unlock_nvmeq(nvmeq); + set_current_state(TASK_RUNNING); + return ret; + } + unlock_nvmeq(nvmeq); schedule_timeout(timeout); if (cmdinfo.status == -EINTR) { - nvme_abort_command(nvmeq, cmdid); + nvmeq = lock_nvmeq(dev, q_idx); + if (nvmeq) + nvme_abort_command(nvmeq, cmdid); + unlock_nvmeq(nvmeq); return -EINTR; } @@ -853,15 +888,20 @@ static int nvme_submit_async_cmd(struct nvme_queue *nvmeq, return cmdid; cmdinfo->status = -EINTR; cmd->common.command_id = cmdid; - nvme_submit_cmd(nvmeq, cmd); - return 0; + return nvme_submit_cmd(nvmeq, cmd); } int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, u32 *result) { - return nvme_submit_sync_cmd(raw_nvmeq(dev, 0), cmd, result, - ADMIN_TIMEOUT); + return nvme_submit_sync_cmd(dev, 0, cmd, result, ADMIN_TIMEOUT); +} + +int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_command *cmd, + u32 *result) +{ + return nvme_submit_sync_cmd(dev, smp_processor_id() + 1, cmd, result, + NVME_IO_TIMEOUT); } static int nvme_submit_admin_cmd_async(struct nvme_dev *dev, @@ -1434,7 +1474,6 @@ void nvme_unmap_user_pages(struct nvme_dev *dev, int write, static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) { struct nvme_dev *dev = ns->dev; - struct nvme_queue *nvmeq; struct nvme_user_io io; struct nvme_command c; unsigned length, meta_len; @@ -1510,20 +1549,10 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL); - nvmeq = get_nvmeq(dev); - /* - * Since nvme_submit_sync_cmd sleeps, we can't keep preemption - * disabled. We may be preempted at any point, and be rescheduled - * to a different CPU. That will cause cacheline bouncing, but no - * additional races since q_lock already protects against other CPUs. 
- */ - put_nvmeq(nvmeq); if (length != (io.nblocks + 1) << ns->lba_shift) status = -ENOMEM; - else if (!nvmeq || nvmeq->q_suspended) - status = -EBUSY; else - status = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT); + status = nvme_submit_io_cmd(dev, &c, NULL); if (meta_len) { if (status == NVME_SC_SUCCESS && !(io.opcode & 1)) { @@ -1597,8 +1626,7 @@ static int nvme_user_admin_cmd(struct nvme_dev *dev, if (length != cmd.data_len) status = -ENOMEM; else - status = nvme_submit_sync_cmd(raw_nvmeq(dev, 0), &c, - &cmd.result, timeout); + status = nvme_submit_sync_cmd(dev, 0, &c, &cmd.result, timeout); if (cmd.data_len) { nvme_unmap_user_pages(dev, cmd.opcode & 1, iod); diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index 4a0ceb64e269..e157e85bb5d7 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -2033,7 +2033,6 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, int res = SNTI_TRANSLATION_SUCCESS; int nvme_sc; struct nvme_dev *dev = ns->dev; - struct nvme_queue *nvmeq; u32 num_cmds; struct nvme_iod *iod; u64 unit_len; @@ -2106,18 +2105,7 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, nvme_offset += unit_num_blocks; - nvmeq = get_nvmeq(dev); - /* - * Since nvme_submit_sync_cmd sleeps, we can't keep - * preemption disabled. We may be preempted at any - * point, and be rescheduled to a different CPU. That - * will cause cacheline bouncing, but no additional - * races since q_lock already protects against other - * CPUs. - */ - put_nvmeq(nvmeq); - nvme_sc = nvme_submit_sync_cmd(nvmeq, &c, NULL, - NVME_IO_TIMEOUT); + nvme_sc = nvme_submit_io_cmd(dev, &c, NULL); if (nvme_sc != NVME_SC_SUCCESS) { nvme_unmap_user_pages(dev, (is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, @@ -2644,7 +2632,6 @@ static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr, { int res = SNTI_TRANSLATION_SUCCESS; int nvme_sc; - struct nvme_queue *nvmeq; struct nvme_command c; u8 immed, pcmod, pc, no_flush, start; @@ -2671,10 +2658,7 @@ static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr, c.common.opcode = nvme_cmd_flush; c.common.nsid = cpu_to_le32(ns->ns_id); - nvmeq = get_nvmeq(ns->dev); - put_nvmeq(nvmeq); - nvme_sc = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT); - + nvme_sc = nvme_submit_io_cmd(ns->dev, &c, NULL); res = nvme_trans_status_code(hdr, nvme_sc); if (res) goto out; @@ -2697,15 +2681,12 @@ static int nvme_trans_synchronize_cache(struct nvme_ns *ns, int res = SNTI_TRANSLATION_SUCCESS; int nvme_sc; struct nvme_command c; - struct nvme_queue *nvmeq; memset(&c, 0, sizeof(c)); c.common.opcode = nvme_cmd_flush; c.common.nsid = cpu_to_le32(ns->ns_id); - nvmeq = get_nvmeq(ns->dev); - put_nvmeq(nvmeq); - nvme_sc = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT); + nvme_sc = nvme_submit_io_cmd(ns->dev, &c, NULL); res = nvme_trans_status_code(hdr, nvme_sc); if (res) @@ -2872,7 +2853,6 @@ static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr, struct nvme_dev *dev = ns->dev; struct scsi_unmap_parm_list *plist; struct nvme_dsm_range *range; - struct nvme_queue *nvmeq; struct nvme_command c; int i, nvme_sc, res = -ENOMEM; u16 ndesc, list_len; @@ -2914,10 +2894,7 @@ static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr, c.dsm.nr = cpu_to_le32(ndesc - 1); c.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); - nvmeq = get_nvmeq(dev); - put_nvmeq(nvmeq); - - nvme_sc = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT); + nvme_sc = 
nvme_submit_io_cmd(dev, &c, NULL); res = nvme_trans_status_code(hdr, nvme_sc); dma_free_coherent(&dev->pci_dev->dev, ndesc * sizeof(*range), -- cgit v1.2.3 From ddcb776286c091189a7b928188112470ec7e9efc Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 24 Mar 2014 10:03:56 -0400 Subject: NVMe: Fix divide-by-zero in nvme_trans_io_get_num_cmds dev->max_hw_sectors may be zero to indicate the device has no limit on the number of sectors. nvme_trans_do_nvme_io() should use the software limit, since this is guaranteed to be non-zero. Reported-by: Mundu Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-scsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index e157e85bb5d7..111c920c1574 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -2044,7 +2044,7 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, struct nvme_command c; u8 opcode = (is_write ? nvme_cmd_write : nvme_cmd_read); u16 control; - u32 max_blocks = nvme_block_nr(ns, dev->max_hw_sectors); + u32 max_blocks = queue_max_hw_sectors(ns->queue); num_cmds = nvme_trans_io_get_num_cmds(hdr, cdb_info, max_blocks); -- cgit v1.2.3 From 6eb0d698efa9c2a35ec3ca958699717c603f85ee Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 24 Mar 2014 10:11:22 -0400 Subject: NVMe: Replace DEFINE_PCI_DEVICE_TABLE Checkpatch has started warning against using DEFINE_PCI_DEVICE_TABLE, so replace it. Also update the copyright date and bump the module version number to 0.9. Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 04664cadadfa..e9495f0bfad3 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1,6 +1,6 @@ /* * NVM Express device driver - * Copyright (c) 2011, Intel Corporation. + * Copyright (c) 2011-2014, Intel Corporation. 
* * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -2601,7 +2601,7 @@ static const struct pci_error_handlers nvme_err_handler = { /* Move to pci_ids.h later */ #define PCI_CLASS_STORAGE_EXPRESS 0x010802 -static DEFINE_PCI_DEVICE_TABLE(nvme_id_table) = { +static const struct pci_device_id nvme_id_table[] = { { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, { 0, } }; @@ -2662,6 +2662,6 @@ static void __exit nvme_exit(void) MODULE_AUTHOR("Matthew Wilcox "); MODULE_LICENSE("GPL"); -MODULE_VERSION("0.8"); +MODULE_VERSION("0.9"); module_init(nvme_init); module_exit(nvme_exit); -- cgit v1.2.3 From e25115786ee540fc428a14872ebd4f56252aba32 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 5 Mar 2014 20:41:36 -0500 Subject: switch nbd to sockfd_lookup/sockfd_put Signed-off-by: Al Viro --- drivers/block/nbd.c | 48 +++++++++++++++++++----------------------------- 1 file changed, 19 insertions(+), 29 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 55298db36b2d..3a70ea2f7cd6 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -630,37 +630,29 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, } case NBD_CLEAR_SOCK: { - struct file *file; - + struct socket *sock = nbd->sock; nbd->sock = NULL; - file = nbd->file; - nbd->file = NULL; nbd_clear_que(nbd); BUG_ON(!list_empty(&nbd->queue_head)); BUG_ON(!list_empty(&nbd->waiting_queue)); kill_bdev(bdev); - if (file) - fput(file); + if (sock) + sockfd_put(sock); return 0; } case NBD_SET_SOCK: { - struct file *file; - if (nbd->file) + struct socket *sock; + int err; + if (nbd->sock) return -EBUSY; - file = fget(arg); - if (file) { - struct inode *inode = file_inode(file); - if (S_ISSOCK(inode->i_mode)) { - nbd->file = file; - nbd->sock = SOCKET_I(inode); - if (max_part > 0) - bdev->bd_invalidated = 1; - nbd->disconnect = 0; /* we're connected now */ - return 0; - } else { - fput(file); - } + sock = sockfd_lookup(arg, &err); + if (sock) { + nbd->sock = sock; + if (max_part > 0) + bdev->bd_invalidated = 1; + nbd->disconnect = 0; /* we're connected now */ + return 0; } return -EINVAL; } @@ -697,12 +689,12 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, case NBD_DO_IT: { struct task_struct *thread; - struct file *file; + struct socket *sock; int error; if (nbd->pid) return -EBUSY; - if (!nbd->file) + if (!nbd->sock) return -EINVAL; mutex_unlock(&nbd->tx_lock); @@ -731,15 +723,15 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, if (error) return error; sock_shutdown(nbd, 0); - file = nbd->file; - nbd->file = NULL; + sock = nbd->sock; + nbd->sock = NULL; nbd_clear_que(nbd); dev_warn(disk_to_dev(nbd->disk), "queue cleared\n"); kill_bdev(bdev); queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue); set_device_ro(bdev, false); - if (file) - fput(file); + if (sock) + sockfd_put(sock); nbd->flags = 0; nbd->bytesize = 0; bdev->bd_inode->i_size = 0; @@ -875,9 +867,7 @@ static int __init nbd_init(void) for (i = 0; i < nbds_max; i++) { struct gendisk *disk = nbd_dev[i].disk; - nbd_dev[i].file = NULL; nbd_dev[i].magic = NBD_MAGIC; - nbd_dev[i].flags = 0; INIT_LIST_HEAD(&nbd_dev[i].waiting_queue); spin_lock_init(&nbd_dev[i].queue_lock); INIT_LIST_HEAD(&nbd_dev[i].queue_head); -- cgit v1.2.3 From f730c848affc05fb7262574b06e0cd7e1fa96096 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 8 Feb 2014 21:07:38 -0500 Subject: 
drbd: don't open-code kernel_recvmsg() Signed-off-by: Al Viro --- drivers/block/drbd/drbd_receiver.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index d073305ffd5e..1385714eccb7 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -468,24 +468,14 @@ static void drbd_wait_ee_list_empty(struct drbd_conf *mdev, static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flags) { - mm_segment_t oldfs; struct kvec iov = { .iov_base = buf, .iov_len = size, }; struct msghdr msg = { - .msg_iovlen = 1, - .msg_iov = (struct iovec *)&iov, .msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL) }; - int rv; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - rv = sock_recvmsg(sock, &msg, size, msg.msg_flags); - set_fs(oldfs); - - return rv; + return kernel_recvmsg(sock, &msg, &iov, 1, size, msg.msg_flags); } static int drbd_recv(struct drbd_tconn *tconn, void *buf, size_t size) -- cgit v1.2.3 From 62054da65c626dd603190c16805f92cf2cf47d4c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 4 Mar 2014 11:57:17 +0200 Subject: rbd: remove out_partial label in rbd_img_request_fill() Commit 03507db631c94 ("rbd: fix buffer size for writes to images with snapshots") moved the call to rbd_img_obj_request_add() up, making the out_partial label bogus. Remove it. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- drivers/block/rbd.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 34898d53395b..43b9814edf97 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2190,6 +2190,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, rbd_segment_name_free(object_name); if (!obj_request) goto out_unwind; + /* * set obj_request->img_request before creating the * osd_request so that it gets the right snapc @@ -2207,7 +2208,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, clone_size, GFP_ATOMIC); if (!obj_request->bio_list) - goto out_partial; + goto out_unwind; } else { unsigned int page_count; @@ -2222,7 +2223,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, osd_req = rbd_osd_req_create(rbd_dev, write_request, obj_request); if (!osd_req) - goto out_partial; + goto out_unwind; obj_request->osd_req = osd_req; obj_request->callback = rbd_img_obj_callback; @@ -2249,8 +2250,6 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, return 0; -out_partial: - rbd_obj_request_put(obj_request); out_unwind: for_each_obj_request_safe(img_request, obj_request, next_obj_request) rbd_obj_request_put(obj_request); -- cgit v1.2.3 From 42dd037c08c7cd6e3e9af7824b0c1d063f838885 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 4 Mar 2014 11:57:17 +0200 Subject: rbd: fix error paths in rbd_img_request_fill() Doing rbd_obj_request_put() in rbd_img_request_fill() error paths is not only insufficient, but also triggers an rbd_assert() in rbd_obj_request_destroy(): Assertion failure in rbd_obj_request_destroy() at line 1867: rbd_assert(obj_request->img_request == NULL); rbd_img_obj_request_add() adds obj_requests to the img_request, the opposite is rbd_img_obj_request_del(). Use it. 
Fixes: http://tracker.ceph.com/issues/7327 Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- drivers/block/rbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 43b9814edf97..b55b7812cf93 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2252,7 +2252,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, out_unwind: for_each_obj_request_safe(img_request, obj_request, next_obj_request) - rbd_obj_request_put(obj_request); + rbd_img_obj_request_del(img_request, obj_request); return -ENOMEM; } -- cgit v1.2.3 From 7cc69d42e6950404587bef9489a5ed6f9f6bab4e Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 25 Feb 2014 16:22:27 +0200 Subject: libceph: bump CEPH_OSD_MAX_OP to 3 Our longest osd request now contains 3 ops: copyup+hint+write. Also, CEPH_OSD_MAX_OP value in a BUG_ON in rbd_osd_req_callback() was hard-coded to 2. Fix it, and switch to rbd_assert while at it. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil Reviewed-by: Alex Elder --- drivers/block/rbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index b55b7812cf93..4c612c4041b6 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1654,7 +1654,7 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, if (osd_req->r_result < 0) obj_request->result = osd_req->r_result; - BUG_ON(osd_req->r_num_ops > 2); + rbd_assert(osd_req->r_num_ops <= CEPH_OSD_MAX_OP); /* * We support a 64-bit length, but ultimately it has to be -- cgit v1.2.3 From deb236b300cea3e7a114115571194b9872dbdfd1 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 25 Feb 2014 16:22:27 +0200 Subject: rbd: num_ops parameter for rbd_osd_req_create() In preparation for prefixing rbd writes with an allocation hint introduce a num_ops parameter for rbd_osd_req_create(). The rationale is that not every write request is a write op that needs to be prefixed (e.g. watch op), so the num_ops logic needs to be in the callers. 
Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil Reviewed-by: Alex Elder --- drivers/block/rbd.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4c612c4041b6..42ecae1436cc 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1718,6 +1718,7 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) static struct ceph_osd_request *rbd_osd_req_create( struct rbd_device *rbd_dev, bool write_request, + unsigned int num_ops, struct rbd_obj_request *obj_request) { struct ceph_snap_context *snapc = NULL; @@ -1733,10 +1734,13 @@ static struct ceph_osd_request *rbd_osd_req_create( snapc = img_request->snapc; } - /* Allocate and initialize the request, for the single op */ + rbd_assert(num_ops == 1); + + /* Allocate and initialize the request, for the num_ops ops */ osdc = &rbd_dev->rbd_client->client->osdc; - osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC); + osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, + GFP_ATOMIC); if (!osd_req) return NULL; /* ENOMEM */ @@ -2220,8 +2224,8 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, pages += page_count; } - osd_req = rbd_osd_req_create(rbd_dev, write_request, - obj_request); + osd_req = rbd_osd_req_create(rbd_dev, write_request, 1, + obj_request); if (!osd_req) goto out_unwind; obj_request->osd_req = osd_req; @@ -2602,8 +2606,8 @@ static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) rbd_assert(obj_request->img_request); rbd_dev = obj_request->img_request->rbd_dev; - stat_request->osd_req = rbd_osd_req_create(rbd_dev, false, - stat_request); + stat_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1, + stat_request); if (!stat_request->osd_req) goto out; stat_request->callback = rbd_img_obj_exists_callback; @@ -2806,7 +2810,8 @@ static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id) return -ENOMEM; ret = -ENOMEM; - obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); + obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1, + obj_request); if (!obj_request->osd_req) goto out; @@ -2869,7 +2874,8 @@ static int __rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start) if (!obj_request) goto out_cancel; - obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request); + obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1, + obj_request); if (!obj_request->osd_req) goto out_cancel; @@ -2977,7 +2983,8 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, obj_request->pages = pages; obj_request->page_count = page_count; - obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); + obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1, + obj_request); if (!obj_request->osd_req) goto out; @@ -3210,7 +3217,8 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, obj_request->pages = pages; obj_request->page_count = page_count; - obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); + obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1, + obj_request); if (!obj_request->osd_req) goto out; -- cgit v1.2.3 From 0ccd59266973047770d5160318561c9189b79c93 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 25 Feb 2014 16:22:28 +0200 Subject: rbd: prefix rbd writes with CEPH_OSD_OP_SETALLOCHINT osd op In an effort to reduce fragmentation, prefix every rbd write with a CEPH_OSD_OP_SETALLOCHINT osd op with an 
expected_write_size value set to the object size (1 << order). Backwards compatibility is taken care of on the libceph/osd side. "The CEPH_OSD_OP_SETALLOCHINT hint is durable, in that it's enough to do it once. The reason every rbd write is prefixed is that rbd doesn't explicitly create objects and relies on writes creating them implicitly, so there is no place to stick a single hint op into. To get around that we decided to prefix every rbd write with a hint (just like write and setattr ops, hint op will create an object implicitly if it doesn't exist)." Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil Reviewed-by: Alex Elder --- drivers/block/rbd.c | 54 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 15 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 42ecae1436cc..4c95b503b09e 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1662,11 +1662,15 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, */ obj_request->xferred = osd_req->r_reply_op_len[0]; rbd_assert(obj_request->xferred < (u64)UINT_MAX); + opcode = osd_req->r_ops[0].op; switch (opcode) { case CEPH_OSD_OP_READ: rbd_osd_read_callback(obj_request); break; + case CEPH_OSD_OP_SETALLOCHINT: + rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE); + /* fall through */ case CEPH_OSD_OP_WRITE: rbd_osd_write_callback(obj_request); break; @@ -1715,6 +1719,12 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) snapc, CEPH_NOSNAP, &mtime); } +/* + * Create an osd request. A read request has one osd op (read). + * A write request has either one (watch) or two (hint+write) osd ops. + * (All rbd data writes are prefixed with an allocation hint op, but + * technically osd watch is a write request, hence this distinction.) + */ static struct ceph_osd_request *rbd_osd_req_create( struct rbd_device *rbd_dev, bool write_request, @@ -1734,7 +1744,7 @@ static struct ceph_osd_request *rbd_osd_req_create( snapc = img_request->snapc; } - rbd_assert(num_ops == 1); + rbd_assert(num_ops == 1 || (write_request && num_ops == 2)); /* Allocate and initialize the request, for the num_ops ops */ @@ -1760,8 +1770,8 @@ static struct ceph_osd_request *rbd_osd_req_create( /* * Create a copyup osd request based on the information in the - * object request supplied. A copyup request has two osd ops, - * a copyup method call, and a "normal" write request. + * object request supplied. A copyup request has three osd ops, + * a copyup method call, a hint op, and a write op. 
*/ static struct ceph_osd_request * rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) @@ -1777,12 +1787,12 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) rbd_assert(img_request); rbd_assert(img_request_write_test(img_request)); - /* Allocate and initialize the request, for the two ops */ + /* Allocate and initialize the request, for the three ops */ snapc = img_request->snapc; rbd_dev = img_request->rbd_dev; osdc = &rbd_dev->rbd_client->client->osdc; - osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC); + osd_req = ceph_osdc_alloc_request(osdc, snapc, 3, false, GFP_ATOMIC); if (!osd_req) return NULL; /* ENOMEM */ @@ -2182,6 +2192,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, const char *object_name; u64 offset; u64 length; + unsigned int which = 0; object_name = rbd_segment_name(rbd_dev, img_offset); if (!object_name) @@ -2224,20 +2235,28 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, pages += page_count; } - osd_req = rbd_osd_req_create(rbd_dev, write_request, 1, + osd_req = rbd_osd_req_create(rbd_dev, write_request, + (write_request ? 2 : 1), obj_request); if (!osd_req) goto out_unwind; obj_request->osd_req = osd_req; obj_request->callback = rbd_img_obj_callback; - osd_req_op_extent_init(osd_req, 0, opcode, offset, length, - 0, 0); + if (write_request) { + osd_req_op_alloc_hint_init(osd_req, which, + rbd_obj_bytes(&rbd_dev->header), + rbd_obj_bytes(&rbd_dev->header)); + which++; + } + + osd_req_op_extent_init(osd_req, which, opcode, offset, length, + 0, 0); if (type == OBJ_REQUEST_BIO) - osd_req_op_extent_osd_data_bio(osd_req, 0, + osd_req_op_extent_osd_data_bio(osd_req, which, obj_request->bio_list, length); else - osd_req_op_extent_osd_data_pages(osd_req, 0, + osd_req_op_extent_osd_data_pages(osd_req, which, obj_request->pages, length, offset & ~PAGE_MASK, false, false); @@ -2356,7 +2375,7 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) /* * The original osd request is of no use to use any more. - * We need a new one that can hold the two ops in a copyup + * We need a new one that can hold the three ops in a copyup * request. Allocate the new copyup osd request for the * original request, and release the old one. */ @@ -2375,17 +2394,22 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0, false, false); - /* Then the original write request op */ + /* Then the hint op */ + + osd_req_op_alloc_hint_init(osd_req, 1, rbd_obj_bytes(&rbd_dev->header), + rbd_obj_bytes(&rbd_dev->header)); + + /* And the original write request op */ offset = orig_request->offset; length = orig_request->length; - osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE, + osd_req_op_extent_init(osd_req, 2, CEPH_OSD_OP_WRITE, offset, length, 0, 0); if (orig_request->type == OBJ_REQUEST_BIO) - osd_req_op_extent_osd_data_bio(osd_req, 1, + osd_req_op_extent_osd_data_bio(osd_req, 2, orig_request->bio_list, length); else - osd_req_op_extent_osd_data_pages(osd_req, 1, + osd_req_op_extent_osd_data_pages(osd_req, 2, orig_request->pages, length, offset & ~PAGE_MASK, false, false); -- cgit v1.2.3 From be2d1d56c82d8cf20e6c77515eb499f8e86eb5be Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:00 -0700 Subject: zram: drop `init_done' struct zram member Introduce init_done() helper function which allows us to drop `init_done' struct zram member. 
init_done() uses the fact that ->init_done == 1 equals to ->meta != NULL. Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Acked-by: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 21 +++++++++++---------- drivers/block/zram/zram_drv.h | 1 - 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 51c557cfd92b..b3e6f072c19b 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -42,6 +42,11 @@ static struct zram *zram_devices; /* Module params (documentation at end) */ static unsigned int num_devices = 1; +static inline int init_done(struct zram *zram) +{ + return zram->meta != NULL; +} + static inline struct zram *dev_to_zram(struct device *dev) { return (struct zram *)dev_to_disk(dev)->private_data; @@ -60,7 +65,7 @@ static ssize_t initstate_show(struct device *dev, { struct zram *zram = dev_to_zram(dev); - return sprintf(buf, "%u\n", zram->init_done); + return sprintf(buf, "%u\n", init_done(zram)); } static ssize_t num_reads_show(struct device *dev, @@ -133,7 +138,7 @@ static ssize_t mem_used_total_show(struct device *dev, struct zram_meta *meta = zram->meta; down_read(&zram->init_lock); - if (zram->init_done) + if (init_done(zram)) val = zs_get_total_size_bytes(meta->mem_pool); up_read(&zram->init_lock); @@ -546,14 +551,12 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) struct zram_meta *meta; down_write(&zram->init_lock); - if (!zram->init_done) { + if (!init_done(zram)) { up_write(&zram->init_lock); return; } meta = zram->meta; - zram->init_done = 0; - /* Free all pages that are still in this zram device */ for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) { unsigned long handle = meta->table[index].handle; @@ -594,8 +597,6 @@ static void zram_init_device(struct zram *zram, struct zram_meta *meta) queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue); zram->meta = meta; - zram->init_done = 1; - pr_debug("Initialization done!\n"); } @@ -615,7 +616,7 @@ static ssize_t disksize_store(struct device *dev, if (!meta) return -ENOMEM; down_write(&zram->init_lock); - if (zram->init_done) { + if (init_done(zram)) { up_write(&zram->init_lock); zram_meta_free(meta); pr_info("Cannot change disksize for initialized device\n"); @@ -736,7 +737,7 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio) struct zram *zram = queue->queuedata; down_read(&zram->init_lock); - if (unlikely(!zram->init_done)) + if (unlikely(!init_done(zram))) goto error; if (!valid_io_request(zram, bio)) { @@ -859,7 +860,7 @@ static int create_device(struct zram *zram, int device_id) goto out_free_disk; } - zram->init_done = 0; + zram->meta = NULL; return 0; out_free_disk: diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index ad8aa35bae00..e81e9cdf4147 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -95,7 +95,6 @@ struct zram { struct zram_meta *meta; struct request_queue *queue; struct gendisk *disk; - int init_done; /* Prevent concurrent execution of device init, reset and R/W request */ struct rw_semaphore init_lock; /* -- cgit v1.2.3 From be257c61306750d11c20d2ac567bf63304c696a3 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:01 -0700 Subject: zram: do not pass rw argument to __zram_make_request() Do not pass rw argument down the __zram_make_request() 
-> zram_bvec_rw() chain, decode it in zram_bvec_rw() instead. Besides, this is the place where we distinguish READ and WRITE bio data directions, so account zram RW stats here, instead of __zram_make_request(). This also allows to account a real number of zram READ/WRITE operations, not just requests (single RW request may cause a number of zram RW ops with separate locking, compression/decompression, etc). Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Acked-by: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index b3e6f072c19b..3f0d6de36f74 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -533,14 +533,18 @@ out: } static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, - int offset, struct bio *bio, int rw) + int offset, struct bio *bio) { int ret; + int rw = bio_data_dir(bio); - if (rw == READ) + if (rw == READ) { + atomic64_inc(&zram->stats.num_reads); ret = zram_bvec_read(zram, bvec, index, offset, bio); - else + } else { + atomic64_inc(&zram->stats.num_writes); ret = zram_bvec_write(zram, bvec, index, offset); + } return ret; } @@ -672,22 +676,13 @@ out: return ret; } -static void __zram_make_request(struct zram *zram, struct bio *bio, int rw) +static void __zram_make_request(struct zram *zram, struct bio *bio) { int offset; u32 index; struct bio_vec bvec; struct bvec_iter iter; - switch (rw) { - case READ: - atomic64_inc(&zram->stats.num_reads); - break; - case WRITE: - atomic64_inc(&zram->stats.num_writes); - break; - } - index = bio->bi_iter.bi_sector >> SECTORS_PER_PAGE_SHIFT; offset = (bio->bi_iter.bi_sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT; @@ -706,16 +701,15 @@ static void __zram_make_request(struct zram *zram, struct bio *bio, int rw) bv.bv_len = max_transfer_size; bv.bv_offset = bvec.bv_offset; - if (zram_bvec_rw(zram, &bv, index, offset, bio, rw) < 0) + if (zram_bvec_rw(zram, &bv, index, offset, bio) < 0) goto out; bv.bv_len = bvec.bv_len - max_transfer_size; bv.bv_offset += max_transfer_size; - if (zram_bvec_rw(zram, &bv, index+1, 0, bio, rw) < 0) + if (zram_bvec_rw(zram, &bv, index + 1, 0, bio) < 0) goto out; } else - if (zram_bvec_rw(zram, &bvec, index, offset, bio, rw) - < 0) + if (zram_bvec_rw(zram, &bvec, index, offset, bio) < 0) goto out; update_position(&index, &offset, &bvec); @@ -745,7 +739,7 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio) goto error; } - __zram_make_request(zram, bio, bio_data_dir(bio)); + __zram_make_request(zram, bio); up_read(&zram->init_lock); return; -- cgit v1.2.3 From b7cccf8b4009bf74df61f3c9d86b95fabd807c11 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:02 -0700 Subject: zram: remove good and bad compress stats Remove `good' and `bad' compressed sub-requests stats. RW request may cause a number of RW sub-requests. zram used to account `good' compressed sub-queries (with compressed size less than 50% of original size), `bad' compressed sub-queries (with compressed size greater that 75% of original size), leaving sub-requests with compression size between 50% and 75% of original size not accounted and not reported. zram already accounts each sub-request's compression size so we can calculate real device compression ratio. 
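With the per-request good/bad buckets gone, the overall ratio can still be derived in userspace from counters zram already exports. A small illustrative reader follows; it assumes a device named zram0, and uses the orig_data_size and compr_data_size sysfs attributes that appear elsewhere in this series.

#include <stdio.h>

/* Read one unsigned counter from a zram sysfs attribute. */
static unsigned long long read_stat(const char *path)
{
	unsigned long long v = 0;
	FILE *f = fopen(path, "r");

	if (f) {
		if (fscanf(f, "%llu", &v) != 1)
			v = 0;
		fclose(f);
	}
	return v;
}

int main(void)
{
	unsigned long long orig = read_stat("/sys/block/zram0/orig_data_size");
	unsigned long long compr = read_stat("/sys/block/zram0/compr_data_size");

	if (compr)
		printf("compression ratio: %.2f\n", (double)orig / (double)compr);
	return 0;
}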
Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Acked-by: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 11 ----------- drivers/block/zram/zram_drv.h | 2 -- 2 files changed, 13 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 3f0d6de36f74..995304ef40bb 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -293,7 +293,6 @@ static void zram_free_page(struct zram *zram, size_t index) { struct zram_meta *meta = zram->meta; unsigned long handle = meta->table[index].handle; - u16 size = meta->table[index].size; if (unlikely(!handle)) { /* @@ -307,14 +306,8 @@ static void zram_free_page(struct zram *zram, size_t index) return; } - if (unlikely(size > max_zpage_size)) - atomic_dec(&zram->stats.bad_compress); - zs_free(meta->mem_pool, handle); - if (size <= PAGE_SIZE / 2) - atomic_dec(&zram->stats.good_compress); - atomic64_sub(meta->table[index].size, &zram->stats.compr_size); atomic_dec(&zram->stats.pages_stored); @@ -478,7 +471,6 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, } if (unlikely(clen > max_zpage_size)) { - atomic_inc(&zram->stats.bad_compress); clen = PAGE_SIZE; src = NULL; if (is_partial_io(bvec)) @@ -518,9 +510,6 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, /* Update stats */ atomic64_add(clen, &zram->stats.compr_size); atomic_inc(&zram->stats.pages_stored); - if (clen <= PAGE_SIZE / 2) - atomic_inc(&zram->stats.good_compress); - out: if (locked) mutex_unlock(&meta->buffer_lock); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index e81e9cdf4147..2f173cb1fd0a 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -78,8 +78,6 @@ struct zram_stats { atomic64_t notify_free; /* no. of swap slot free notifications */ atomic_t pages_zero; /* no. of zero filled pages */ atomic_t pages_stored; /* no. of pages currently stored */ - atomic_t good_compress; /* % of pages with compression ratio<=50% */ - atomic_t bad_compress; /* % of pages with compression ratio>=75% */ }; struct zram_meta { -- cgit v1.2.3 From 90a7806ea9b9f7cb4751859cc2506e2d80e36ef1 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:03 -0700 Subject: zram: use atomic64_t for all zram stats This is a preparation patch for stats code duplication removal. 1) use atomic64_t for `pages_zero' and `pages_stored' zram stats. 2) `compr_size' and `pages_zero' struct zram_stats members did not follow the existing device attr naming scheme: zram_stats.ATTR has ATTR_show() function. rename them: -- compr_size -> compr_data_size -- pages_zero -> zero_pages Minchan Kim's note: If we really have trouble with atomic stat operation, we could change it with percpu_counter so that it could solve atomic overhead and unnecessary memory space by introducing unsigned long instead of 64bit atomic_t. 
Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Acked-by: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 18 +++++++++--------- drivers/block/zram/zram_drv.h | 6 +++--- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 995304ef40bb..1bf97b2f8f3f 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -109,7 +109,7 @@ static ssize_t zero_pages_show(struct device *dev, { struct zram *zram = dev_to_zram(dev); - return sprintf(buf, "%u\n", atomic_read(&zram->stats.pages_zero)); + return sprintf(buf, "%llu\n", (u64)atomic64_read(&zram->stats.zero_pages)); } static ssize_t orig_data_size_show(struct device *dev, @@ -118,7 +118,7 @@ static ssize_t orig_data_size_show(struct device *dev, struct zram *zram = dev_to_zram(dev); return sprintf(buf, "%llu\n", - (u64)(atomic_read(&zram->stats.pages_stored)) << PAGE_SHIFT); + (u64)(atomic64_read(&zram->stats.pages_stored)) << PAGE_SHIFT); } static ssize_t compr_data_size_show(struct device *dev, @@ -127,7 +127,7 @@ static ssize_t compr_data_size_show(struct device *dev, struct zram *zram = dev_to_zram(dev); return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.compr_size)); + (u64)atomic64_read(&zram->stats.compr_data_size)); } static ssize_t mem_used_total_show(struct device *dev, @@ -301,15 +301,15 @@ static void zram_free_page(struct zram *zram, size_t index) */ if (zram_test_flag(meta, index, ZRAM_ZERO)) { zram_clear_flag(meta, index, ZRAM_ZERO); - atomic_dec(&zram->stats.pages_zero); + atomic64_dec(&zram->stats.zero_pages); } return; } zs_free(meta->mem_pool, handle); - atomic64_sub(meta->table[index].size, &zram->stats.compr_size); - atomic_dec(&zram->stats.pages_stored); + atomic64_sub(meta->table[index].size, &zram->stats.compr_data_size); + atomic64_dec(&zram->stats.pages_stored); meta->table[index].handle = 0; meta->table[index].size = 0; @@ -452,7 +452,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, zram_set_flag(meta, index, ZRAM_ZERO); write_unlock(&zram->meta->tb_lock); - atomic_inc(&zram->stats.pages_zero); + atomic64_inc(&zram->stats.zero_pages); ret = 0; goto out; } @@ -508,8 +508,8 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, write_unlock(&zram->meta->tb_lock); /* Update stats */ - atomic64_add(clen, &zram->stats.compr_size); - atomic_inc(&zram->stats.pages_stored); + atomic64_add(clen, &zram->stats.compr_data_size); + atomic64_inc(&zram->stats.pages_stored); out: if (locked) mutex_unlock(&meta->buffer_lock); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 2f173cb1fd0a..58d4ac537f65 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -69,15 +69,15 @@ struct table { } __aligned(4); struct zram_stats { - atomic64_t compr_size; /* compressed size of pages stored */ + atomic64_t compr_data_size; /* compressed size of pages stored */ atomic64_t num_reads; /* failed + successful */ atomic64_t num_writes; /* --do-- */ atomic64_t failed_reads; /* should NEVER! happen */ atomic64_t failed_writes; /* can happen when memory is too low */ atomic64_t invalid_io; /* non-page-aligned I/O requests */ atomic64_t notify_free; /* no. of swap slot free notifications */ - atomic_t pages_zero; /* no. of zero filled pages */ - atomic_t pages_stored; /* no. 
of pages currently stored */ + atomic64_t zero_pages; /* no. of zero filled pages */ + atomic64_t pages_stored; /* no. of pages currently stored */ }; struct zram_meta { -- cgit v1.2.3 From a68eb3b65e658406d386bebef02277f4007b2f45 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:04 -0700 Subject: zram: remove zram stats code duplication Introduce ZRAM_ATTR_RO macro that generates device_attribute and default ATTR show() function for existing atomic64_t zram stats. Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 82 ++++++++++++------------------------------- 1 file changed, 23 insertions(+), 59 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 1bf97b2f8f3f..29c35119e82a 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -42,6 +42,17 @@ static struct zram *zram_devices; /* Module params (documentation at end) */ static unsigned int num_devices = 1; +#define ZRAM_ATTR_RO(name) \ +static ssize_t zram_attr_##name##_show(struct device *d, \ + struct device_attribute *attr, char *b) \ +{ \ + struct zram *zram = dev_to_zram(d); \ + return sprintf(b, "%llu\n", \ + (u64)atomic64_read(&zram->stats.name)); \ +} \ +static struct device_attribute dev_attr_##name = \ + __ATTR(name, S_IRUGO, zram_attr_##name##_show, NULL); + static inline int init_done(struct zram *zram) { return zram->meta != NULL; @@ -63,53 +74,14 @@ static ssize_t disksize_show(struct device *dev, static ssize_t initstate_show(struct device *dev, struct device_attribute *attr, char *buf) { + u32 val; struct zram *zram = dev_to_zram(dev); - return sprintf(buf, "%u\n", init_done(zram)); -} - -static ssize_t num_reads_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.num_reads)); -} - -static ssize_t num_writes_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.num_writes)); -} - -static ssize_t invalid_io_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.invalid_io)); -} - -static ssize_t notify_free_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.notify_free)); -} - -static ssize_t zero_pages_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); + down_read(&zram->init_lock); + val = init_done(zram); + up_read(&zram->init_lock); - return sprintf(buf, "%llu\n", (u64)atomic64_read(&zram->stats.zero_pages)); + return sprintf(buf, "%u\n", val); } static ssize_t orig_data_size_show(struct device *dev, @@ -121,15 +93,6 @@ static ssize_t orig_data_size_show(struct device *dev, (u64)(atomic64_read(&zram->stats.pages_stored)) << PAGE_SHIFT); } -static ssize_t compr_data_size_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.compr_data_size)); -} - static ssize_t 
mem_used_total_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -762,15 +725,16 @@ static DEVICE_ATTR(disksize, S_IRUGO | S_IWUSR, disksize_show, disksize_store); static DEVICE_ATTR(initstate, S_IRUGO, initstate_show, NULL); static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store); -static DEVICE_ATTR(num_reads, S_IRUGO, num_reads_show, NULL); -static DEVICE_ATTR(num_writes, S_IRUGO, num_writes_show, NULL); -static DEVICE_ATTR(invalid_io, S_IRUGO, invalid_io_show, NULL); -static DEVICE_ATTR(notify_free, S_IRUGO, notify_free_show, NULL); -static DEVICE_ATTR(zero_pages, S_IRUGO, zero_pages_show, NULL); static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL); -static DEVICE_ATTR(compr_data_size, S_IRUGO, compr_data_size_show, NULL); static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); +ZRAM_ATTR_RO(num_reads); +ZRAM_ATTR_RO(num_writes); +ZRAM_ATTR_RO(invalid_io); +ZRAM_ATTR_RO(notify_free); +ZRAM_ATTR_RO(zero_pages); +ZRAM_ATTR_RO(compr_data_size); + static struct attribute *zram_disk_attrs[] = { &dev_attr_disksize.attr, &dev_attr_initstate.attr, -- cgit v1.2.3 From 6444724939db5de7390c90f7b4a657159b3b4465 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:05 -0700 Subject: zram: report failed read and write stats zram accounted but did not report numbers of failed read and write queries. make these stats available as failed_reads and failed_writes attrs. Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Acked-by: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 29c35119e82a..c06d7fba4237 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -730,6 +730,8 @@ static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); ZRAM_ATTR_RO(num_reads); ZRAM_ATTR_RO(num_writes); +ZRAM_ATTR_RO(failed_reads); +ZRAM_ATTR_RO(failed_writes); ZRAM_ATTR_RO(invalid_io); ZRAM_ATTR_RO(notify_free); ZRAM_ATTR_RO(zero_pages); @@ -741,6 +743,8 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_reset.attr, &dev_attr_num_reads.attr, &dev_attr_num_writes.attr, + &dev_attr_failed_reads.attr, + &dev_attr_failed_writes.attr, &dev_attr_invalid_io.attr, &dev_attr_notify_free.attr, &dev_attr_zero_pages.attr, -- cgit v1.2.3 From 59fc86a4922f1a1c0f69eac758a7e2b2b138aab4 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:06 -0700 Subject: zram: drop not used table `count' member struct table `count' member is not used. 
Signed-off-by: Sergey Senozhatsky Cc: Minchan Kim Acked-by: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.h | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 58d4ac537f65..1d5b1f5786a8 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -64,7 +64,6 @@ enum zram_pageflags { struct table { unsigned long handle; u16 size; /* object size (excluding header) */ - u8 count; /* object ref count (not yet used) */ u8 flags; } __aligned(4); -- cgit v1.2.3 From e64cd51d2fa87733176246101df871a8ac5c7c20 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:07 -0700 Subject: zram: move zram size warning to documentation Move zram warning about disksize and size of memory correlation to zram documentation. Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index c06d7fba4237..21aee3edcb25 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -535,23 +535,8 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) static void zram_init_device(struct zram *zram, struct zram_meta *meta) { - if (zram->disksize > 2 * (totalram_pages << PAGE_SHIFT)) { - pr_info( - "There is little point creating a zram of greater than " - "twice the size of memory since we expect a 2:1 compression " - "ratio. Note that zram uses about 0.1%% of the size of " - "the disk when not in use so a huge zram is " - "wasteful.\n" - "\tMemory Size: %lu kB\n" - "\tSize you selected: %llu kB\n" - "Continuing anyway ...\n", - (totalram_pages << PAGE_SHIFT) >> 10, zram->disksize >> 10 - ); - } - /* zram devices sort of resembles non-rotational disks */ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue); - zram->meta = meta; pr_debug("Initialization done!\n"); } -- cgit v1.2.3 From b67d1ec189ffb92cdad9b2bd29475fb1e0166983 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:09 -0700 Subject: zram: delete zram_init_device() allocate new `zram_meta' in disksize_store() only for uninitialised zram device, saving a number of allocations and deallocations in case if disksize_store() was called on currently used device. at the same time zram_meta stack variable is not necessary, because we can set ->meta directly. there is also no need in setting QUEUE_FLAG_NONROT queue on every disksize_store(), set it once during device creation. 
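Condensed from the patch below, the resulting disksize_store() flow is: allocate with GFP_KERNEL while no lock is held, then take init_lock only to publish the result (a simplified sketch, not a verbatim quote):

	meta = zram_meta_alloc(disksize);	/* may sleep, no lock held */
	if (!meta)
		return -ENOMEM;

	down_write(&zram->init_lock);
	if (init_done(zram)) {			/* device already initialised */
		zram_meta_free(meta);
		up_write(&zram->init_lock);
		return -EBUSY;
	}
	zram->meta = meta;			/* set ->meta directly */
	zram->disksize = disksize;
	set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
	up_write(&zram->init_lock);
	return len;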
[minchan@kernel.org: handle zram->meta alloc fail case] [minchan@kernel.org: prevent lockdep spew of init_lock] Signed-off-by: Sergey Senozhatsky Signed-off-by: Minchan Kim Acked-by: Jerome Marchand Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 21aee3edcb25..76ba67673a90 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -533,14 +533,6 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) up_write(&zram->init_lock); } -static void zram_init_device(struct zram *zram, struct zram_meta *meta) -{ - /* zram devices sort of resembles non-rotational disks */ - queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue); - zram->meta = meta; - pr_debug("Initialization done!\n"); -} - static ssize_t disksize_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -556,17 +548,18 @@ static ssize_t disksize_store(struct device *dev, meta = zram_meta_alloc(disksize); if (!meta) return -ENOMEM; + down_write(&zram->init_lock); if (init_done(zram)) { - up_write(&zram->init_lock); zram_meta_free(meta); + up_write(&zram->init_lock); pr_info("Cannot change disksize for initialized device\n"); return -EBUSY; } + zram->meta = meta; zram->disksize = disksize; set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); - zram_init_device(zram, meta); up_write(&zram->init_lock); return len; @@ -776,7 +769,8 @@ static int create_device(struct zram *zram, int device_id) /* Actual capacity set using syfs (/sys/block/zram/disksize */ set_capacity(zram->disk, 0); - + /* zram devices sort of resembles non-rotational disks */ + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue); /* * To ensure that we always get PAGE_SIZE aligned * and n*PAGE_SIZED sized I/O requests. -- cgit v1.2.3 From e7e1ef439d18f9a21521116ea9f2b976d7230e54 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:11 -0700 Subject: zram: introduce compressing backend abstraction ZRAM performs direct LZO compression algorithm calls, making it the one and only option. While LZO generally performs well, the LZ4 algorithm tends to have faster decompression (see http://code.google.com/p/lz4/ for full report) Name Ratio C.speed D.speed MB/s MB/s LZ4 (r101) 2.084 422 1820 LZO 2.06 2.106 414 600 Thus, users who have mostly read (decompress) usage scenarios or a mixed workload (writes with a relatively high number of read ops) will benefit from using the LZ4 compression backend. Introduce compressing backend abstraction zcomp in order to support multiple compression algorithms with the following set of operations: .create .destroy .compress .decompress Schematically, zram write() usually contains the following steps: 0) preparation (decompression of partial IO, etc.) 1) lock buffer_lock mutex (protects meta compress buffers) 2) compress (using meta compress buffers) 3) alloc and map zs_pool object 4) copy compressed data (from meta compress buffers) to object allocated by 3) 5) free previous pool page, assign a new one 6) unlock buffer_lock mutex As we can see, compressing buffers must remain untouched from 1) to 4), because, otherwise, concurrent write() can overwrite data.
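In the current driver these steps correspond, roughly, to the following fragment (an illustrative condensation of zram_bvec_write(), not a verbatim quote):

	mutex_lock(&meta->buffer_lock);				/* step 1 */
	ret = lzo1x_1_compress(uncmem, PAGE_SIZE,
			meta->compress_buffer, &clen,
			meta->compress_workmem);		/* step 2 */
	handle = zs_malloc(meta->mem_pool, clen);		/* step 3 */
	cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO);
	memcpy(cmem, meta->compress_buffer, clen);		/* step 4 */
	zs_unmap_object(meta->mem_pool, handle);
	zram_free_page(zram, index);				/* step 5 */
	meta->table[index].handle = handle;
	mutex_unlock(&meta->buffer_lock);			/* step 6 */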
At the same time, zram_meta must be aware of a) specific compression algorithm memory requirements and b) necessary locking to protect compression buffers. To remove requirement a) new struct zcomp_strm introduced, which contains a compress/decompress `buffer' and compression algorithm `private' part. While struct zcomp implements zcomp_strm stream handling and locking and removes requirement b) from zram meta. zcomp ->create() and ->destroy(), respectively, allocate and deallocate algorithm specific zcomp_strm `private' part. Every zcomp has zcomp stream and mutex to protect its compression stream. Stream usage semantics remains the same -- only one write can hold stream lock and use its buffers. zcomp_strm_find() turns caller into exclusive user of a stream (holding stream mutex until zram release stream), and zcomp_strm_release() makes zcomp stream available (unlock the stream mutex). Hence no concurrent write (compression) operations possible at the moment. iozone -t 3 -R -r 16K -s 60M -I +Z test base patched -------------------------------------------------- Initial write 597992.91 591660.58 Rewrite 609674.34 616054.97 Read 2404771.75 2452909.12 Re-read 2459216.81 2470074.44 Reverse Read 1652769.66 1589128.66 Stride read 2202441.81 2202173.31 Random read 2236311.47 2276565.31 Mixed workload 1423760.41 1709760.06 Random write 579584.08 615933.86 Pwrite 597550.02 594933.70 Pread 1703672.53 1718126.72 Fwrite 1330497.06 1461054.00 Fread 3922851.00 3957242.62 Usage examples: comp = zcomp_create(NAME) /* NAME e.g. "lzo" */ which initialises compressing backend if requested algorithm is supported. Compress: zstrm = zcomp_strm_find(comp) zcomp_compress(comp, zstrm, src, &dst_len) [..] /* copy compressed data */ zcomp_strm_release(comp, zstrm) Decompress: zcomp_decompress(comp, src, src_len, dst); Free compessing backend and its zcomp stream: zcomp_destroy(comp) Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zcomp.c | 115 +++++++++++++++++++++++++++++++++++++++++ drivers/block/zram/zcomp.h | 58 +++++++++++++++++++++ drivers/block/zram/zcomp_lzo.c | 47 +++++++++++++++++ drivers/block/zram/zcomp_lzo.h | 17 ++++++ 4 files changed, 237 insertions(+) create mode 100644 drivers/block/zram/zcomp.c create mode 100644 drivers/block/zram/zcomp.h create mode 100644 drivers/block/zram/zcomp_lzo.c create mode 100644 drivers/block/zram/zcomp_lzo.h (limited to 'drivers/block') diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c new file mode 100644 index 000000000000..22f4ae235660 --- /dev/null +++ b/drivers/block/zram/zcomp.c @@ -0,0 +1,115 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include + +#include "zcomp.h" +#include "zcomp_lzo.h" + +static struct zcomp_backend *find_backend(const char *compress) +{ + if (strncmp(compress, "lzo", 3) == 0) + return &zcomp_lzo; + return NULL; +} + +static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *zstrm) +{ + if (zstrm->private) + comp->backend->destroy(zstrm->private); + free_pages((unsigned long)zstrm->buffer, 1); + kfree(zstrm); +} + +/* + * allocate new zcomp_strm structure with ->private initialized by + * backend, return NULL on error + */ +static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp) +{ + struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), GFP_KERNEL); + if (!zstrm) + return NULL; + + zstrm->private = comp->backend->create(); + /* + * allocate 2 pages. 1 for compressed data, plus 1 extra for the + * case when compressed size is larger than the original one + */ + zstrm->buffer = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1); + if (!zstrm->private || !zstrm->buffer) { + zcomp_strm_free(comp, zstrm); + zstrm = NULL; + } + return zstrm; +} + +struct zcomp_strm *zcomp_strm_find(struct zcomp *comp) +{ + mutex_lock(&comp->strm_lock); + return comp->zstrm; +} + +void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm) +{ + mutex_unlock(&comp->strm_lock); +} + +int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, + const unsigned char *src, size_t *dst_len) +{ + return comp->backend->compress(src, zstrm->buffer, dst_len, + zstrm->private); +} + +int zcomp_decompress(struct zcomp *comp, const unsigned char *src, + size_t src_len, unsigned char *dst) +{ + return comp->backend->decompress(src, src_len, dst); +} + +void zcomp_destroy(struct zcomp *comp) +{ + zcomp_strm_free(comp, comp->zstrm); + kfree(comp); +} + +/* + * search available compressors for requested algorithm. + * allocate new zcomp and initialize it. return NULL + * if requested algorithm is not supported or in case + * of init error + */ +struct zcomp *zcomp_create(const char *compress) +{ + struct zcomp *comp; + struct zcomp_backend *backend; + + backend = find_backend(compress); + if (!backend) + return NULL; + + comp = kzalloc(sizeof(struct zcomp), GFP_KERNEL); + if (!comp) + return NULL; + + comp->backend = backend; + mutex_init(&comp->strm_lock); + + comp->zstrm = zcomp_strm_alloc(comp); + if (!comp->zstrm) { + kfree(comp); + return NULL; + } + return comp; +} diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h new file mode 100644 index 000000000000..c9a98e1317fe --- /dev/null +++ b/drivers/block/zram/zcomp.h @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _ZCOMP_H_ +#define _ZCOMP_H_ + +#include + +struct zcomp_strm { + /* compression/decompression buffer */ + void *buffer; + /* + * The private data of the compression stream, only compression + * stream backend can touch this (e.g. 
compression algorithm + * working memory) + */ + void *private; +}; + +/* static compression backend */ +struct zcomp_backend { + int (*compress)(const unsigned char *src, unsigned char *dst, + size_t *dst_len, void *private); + + int (*decompress)(const unsigned char *src, size_t src_len, + unsigned char *dst); + + void *(*create)(void); + void (*destroy)(void *private); + + const char *name; +}; + +/* dynamic per-device compression frontend */ +struct zcomp { + struct mutex strm_lock; + struct zcomp_strm *zstrm; + struct zcomp_backend *backend; +}; + +struct zcomp *zcomp_create(const char *comp); +void zcomp_destroy(struct zcomp *comp); + +struct zcomp_strm *zcomp_strm_find(struct zcomp *comp); +void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm); + +int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, + const unsigned char *src, size_t *dst_len); + +int zcomp_decompress(struct zcomp *comp, const unsigned char *src, + size_t src_len, unsigned char *dst); +#endif /* _ZCOMP_H_ */ diff --git a/drivers/block/zram/zcomp_lzo.c b/drivers/block/zram/zcomp_lzo.c new file mode 100644 index 000000000000..da1bc47d588e --- /dev/null +++ b/drivers/block/zram/zcomp_lzo.c @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include + +#include "zcomp_lzo.h" + +static void *lzo_create(void) +{ + return kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); +} + +static void lzo_destroy(void *private) +{ + kfree(private); +} + +static int lzo_compress(const unsigned char *src, unsigned char *dst, + size_t *dst_len, void *private) +{ + int ret = lzo1x_1_compress(src, PAGE_SIZE, dst, dst_len, private); + return ret == LZO_E_OK ? 0 : ret; +} + +static int lzo_decompress(const unsigned char *src, size_t src_len, + unsigned char *dst) +{ + size_t dst_len = PAGE_SIZE; + int ret = lzo1x_decompress_safe(src, src_len, dst, &dst_len); + return ret == LZO_E_OK ? 0 : ret; +} + +struct zcomp_backend zcomp_lzo = { + .compress = lzo_compress, + .decompress = lzo_decompress, + .create = lzo_create, + .destroy = lzo_destroy, + .name = "lzo", +}; diff --git a/drivers/block/zram/zcomp_lzo.h b/drivers/block/zram/zcomp_lzo.h new file mode 100644 index 000000000000..128c5807fa14 --- /dev/null +++ b/drivers/block/zram/zcomp_lzo.h @@ -0,0 +1,17 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _ZCOMP_LZO_H_ +#define _ZCOMP_LZO_H_ + +#include "zcomp.h" + +extern struct zcomp_backend zcomp_lzo; + +#endif /* _ZCOMP_LZO_H_ */ -- cgit v1.2.3 From b7ca232ee7e85ed3b18e39eb20a7f458ee1d6047 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:12 -0700 Subject: zram: use zcomp compressing backends Do not perform direct LZO compress/decompress calls, initialise and use zcomp LZO backend (single compression stream) instead. 
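After the conversion, the hot part of zram_bvec_write() reduces to roughly the following (illustrative fragment, not the complete function):

	zstrm = zcomp_strm_find(zram->comp);	/* take the per-device stream */
	ret = zcomp_compress(zram->comp, zstrm, uncmem, &clen);
	if (unlikely(ret)) {
		pr_err("Compression failed! err=%d\n", ret);
		goto out;
	}
	src = zstrm->buffer;			/* compressed data is in the stream buffer */
	/* zs_malloc()/zs_map_object()/memcpy() as before */
	zcomp_strm_release(zram->comp, zstrm);	/* hand the stream back */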
[akpm@linux-foundation.org: resolve conflicts with zram-delete-zram_init_device-fix.patch] Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/Makefile | 2 +- drivers/block/zram/zram_drv.c | 69 +++++++++++++++++++------------------------ drivers/block/zram/zram_drv.h | 8 ++--- 3 files changed, 36 insertions(+), 43 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile index cb0f9ced6a93..757c6a5cadff 100644 --- a/drivers/block/zram/Makefile +++ b/drivers/block/zram/Makefile @@ -1,3 +1,3 @@ -zram-y := zram_drv.o +zram-y := zcomp_lzo.o zcomp.o zram_drv.o obj-$(CONFIG_ZRAM) += zram.o diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 76ba67673a90..98823f9ca8b1 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include @@ -38,6 +37,7 @@ /* Globals */ static int zram_major; static struct zram *zram_devices; +static const char *default_compressor = "lzo"; /* Module params (documentation at end) */ static unsigned int num_devices = 1; @@ -160,8 +160,6 @@ static inline int valid_io_request(struct zram *zram, struct bio *bio) static void zram_meta_free(struct zram_meta *meta) { zs_destroy_pool(meta->mem_pool); - kfree(meta->compress_workmem); - free_pages((unsigned long)meta->compress_buffer, 1); vfree(meta->table); kfree(meta); } @@ -173,22 +171,11 @@ static struct zram_meta *zram_meta_alloc(u64 disksize) if (!meta) goto out; - meta->compress_workmem = kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); - if (!meta->compress_workmem) - goto free_meta; - - meta->compress_buffer = - (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1); - if (!meta->compress_buffer) { - pr_err("Error allocating compressor buffer space\n"); - goto free_workmem; - } - num_pages = disksize >> PAGE_SHIFT; meta->table = vzalloc(num_pages * sizeof(*meta->table)); if (!meta->table) { pr_err("Error allocating zram address table\n"); - goto free_buffer; + goto free_meta; } meta->mem_pool = zs_create_pool(GFP_NOIO | __GFP_HIGHMEM); @@ -198,15 +185,10 @@ static struct zram_meta *zram_meta_alloc(u64 disksize) } rwlock_init(&meta->tb_lock); - mutex_init(&meta->buffer_lock); return meta; free_table: vfree(meta->table); -free_buffer: - free_pages((unsigned long)meta->compress_buffer, 1); -free_workmem: - kfree(meta->compress_workmem); free_meta: kfree(meta); meta = NULL; @@ -280,8 +262,7 @@ static void zram_free_page(struct zram *zram, size_t index) static int zram_decompress_page(struct zram *zram, char *mem, u32 index) { - int ret = LZO_E_OK; - size_t clen = PAGE_SIZE; + int ret = 0; unsigned char *cmem; struct zram_meta *meta = zram->meta; unsigned long handle; @@ -301,12 +282,12 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) if (size == PAGE_SIZE) copy_page(mem, cmem); else - ret = lzo1x_decompress_safe(cmem, size, mem, &clen); + ret = zcomp_decompress(zram->comp, cmem, size, mem); zs_unmap_object(meta->mem_pool, handle); read_unlock(&meta->tb_lock); /* Should NEVER happen. Return bio error if it does. */ - if (unlikely(ret != LZO_E_OK)) { + if (unlikely(ret)) { pr_err("Decompression failed! 
err=%d, page=%u\n", ret, index); atomic64_inc(&zram->stats.failed_reads); return ret; @@ -349,7 +330,7 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, ret = zram_decompress_page(zram, uncmem, index); /* Should NEVER happen. Return bio error if it does. */ - if (unlikely(ret != LZO_E_OK)) + if (unlikely(ret)) goto out_cleanup; if (is_partial_io(bvec)) @@ -374,11 +355,10 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, struct page *page; unsigned char *user_mem, *cmem, *src, *uncmem = NULL; struct zram_meta *meta = zram->meta; + struct zcomp_strm *zstrm; bool locked = false; page = bvec->bv_page; - src = meta->compress_buffer; - if (is_partial_io(bvec)) { /* * This is a partial IO. We need to read the full page @@ -394,7 +374,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, goto out; } - mutex_lock(&meta->buffer_lock); + zstrm = zcomp_strm_find(zram->comp); locked = true; user_mem = kmap_atomic(page); @@ -420,22 +400,20 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, goto out; } - ret = lzo1x_1_compress(uncmem, PAGE_SIZE, src, &clen, - meta->compress_workmem); + ret = zcomp_compress(zram->comp, zstrm, uncmem, &clen); if (!is_partial_io(bvec)) { kunmap_atomic(user_mem); user_mem = NULL; uncmem = NULL; } - if (unlikely(ret != LZO_E_OK)) { + if (unlikely(ret)) { pr_err("Compression failed! err=%d\n", ret); goto out; } - + src = zstrm->buffer; if (unlikely(clen > max_zpage_size)) { clen = PAGE_SIZE; - src = NULL; if (is_partial_io(bvec)) src = uncmem; } @@ -457,6 +435,8 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, memcpy(cmem, src, clen); } + zcomp_strm_release(zram->comp, zstrm); + locked = false; zs_unmap_object(meta->mem_pool, handle); /* @@ -475,10 +455,9 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, atomic64_inc(&zram->stats.pages_stored); out: if (locked) - mutex_unlock(&meta->buffer_lock); + zcomp_strm_release(zram->comp, zstrm); if (is_partial_io(bvec)) kfree(uncmem); - if (ret) atomic64_inc(&zram->stats.failed_writes); return ret; @@ -522,6 +501,7 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) zs_free(meta->mem_pool, handle); } + zcomp_destroy(zram->comp); zram_meta_free(zram->meta); zram->meta = NULL; /* Reset stats */ @@ -539,6 +519,7 @@ static ssize_t disksize_store(struct device *dev, u64 disksize; struct zram_meta *meta; struct zram *zram = dev_to_zram(dev); + int err; disksize = memparse(buf, NULL); if (!disksize) @@ -551,10 +532,17 @@ static ssize_t disksize_store(struct device *dev, down_write(&zram->init_lock); if (init_done(zram)) { - zram_meta_free(meta); - up_write(&zram->init_lock); pr_info("Cannot change disksize for initialized device\n"); - return -EBUSY; + err = -EBUSY; + goto out_free_meta; + } + + zram->comp = zcomp_create(default_compressor); + if (!zram->comp) { + pr_info("Cannot initialise %s compressing backend\n", + default_compressor); + err = -EINVAL; + goto out_free_meta; } zram->meta = meta; @@ -563,6 +551,11 @@ static ssize_t disksize_store(struct device *dev, up_write(&zram->init_lock); return len; + +out_free_meta: + up_write(&zram->init_lock); + zram_meta_free(meta); + return err; } static ssize_t reset_store(struct device *dev, diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 1d5b1f5786a8..45e04f7b713f 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -16,9 +16,10 @@ 
#define _ZRAM_DRV_H_ #include -#include #include +#include "zcomp.h" + /* * Some arbitrary value. This is just to catch * invalid value for num_devices module parameter. */ @@ -81,17 +82,16 @@ struct zram_stats { struct zram_meta { rwlock_t tb_lock; /* protect table */ - void *compress_workmem; - void *compress_buffer; struct table *table; struct zs_pool *mem_pool; - struct mutex buffer_lock; /* protect compress buffers */ }; struct zram { struct zram_meta *meta; struct request_queue *queue; struct gendisk *disk; + struct zcomp *comp; + /* Prevent concurrent execution of device init, reset and R/W request */ struct rw_semaphore init_lock; /* -- cgit v1.2.3 From 9cc97529a180b369fcb7e5265771b6ba7e01f05b Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:13 -0700 Subject: zram: factor out single stream compression This is a preparation patch to add multi stream support to zcomp. Introduce struct zcomp_strm_single and a set of functions to manage zcomp_strm stream access. zcomp_strm_single implements a single compression stream, the same way as the current zcomp implementation. This moves zcomp_strm stream control and locking out of zcomp, so the compressing backend zcomp is not aware of the required locking. Single and multi streams require different locking schemes. Minchan Kim reported that the spinlock-based locking scheme (which is used in the multi stream implementation) has demonstrated a severe performance regression for the single compression stream case, compared to mutex-based locking; see https://lkml.org/lkml/2014/2/18/16 The following set of functions is added: - zcomp_strm_single_find()/zcomp_strm_single_release() find and release a compression stream, implement required locking - zcomp_strm_single_create()/zcomp_strm_single_destroy() create and destroy zcomp_strm_single New ->strm_find() and ->strm_release() callbacks are added to zcomp, which are set to zcomp_strm_single_find() and zcomp_strm_single_release() during initialisation. Instead of direct locking and zcomp_strm access from zcomp_strm_find() and zcomp_strm_release(), zcomp now calls ->strm_find() and ->strm_release() respectively.
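To see what the contract looks like from a policy's point of view, here is a deliberately trivial, hypothetical skeleton (names are made up; it is not part of the patch and performs no locking, which a real policy must add):

	struct zcomp_strm_dummy {
		struct zcomp_strm *zstrm;	/* one stream, no sharing */
	};

	static struct zcomp_strm *zcomp_strm_dummy_find(struct zcomp *comp)
	{
		struct zcomp_strm_dummy *zs = comp->stream;

		/* a real policy serialises access here (mutex, spinlock, ...) */
		return zs->zstrm;
	}

	static void zcomp_strm_dummy_release(struct zcomp *comp,
				struct zcomp_strm *zstrm)
	{
		/* a real policy unlocks or re-queues the stream here */
	}

	static void zcomp_strm_dummy_destroy(struct zcomp *comp)
	{
		struct zcomp_strm_dummy *zs = comp->stream;

		zcomp_strm_free(comp, zs->zstrm);
		kfree(zs);
	}

A corresponding create function would allocate the struct, set comp->stream and the ->strm_find()/->strm_release()/->destroy() callbacks, exactly as zcomp_strm_single_create() does in the patch.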
Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zcomp.c | 62 ++++++++++++++++++++++++++++++++++++++++------ drivers/block/zram/zcomp.h | 7 ++++-- 2 files changed, 59 insertions(+), 10 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 22f4ae235660..72e8071f9d73 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -16,6 +16,14 @@ #include "zcomp.h" #include "zcomp_lzo.h" +/* + * single zcomp_strm backend + */ +struct zcomp_strm_single { + struct mutex strm_lock; + struct zcomp_strm *zstrm; +}; + static struct zcomp_backend *find_backend(const char *compress) { if (strncmp(compress, "lzo", 3) == 0) @@ -54,15 +62,56 @@ static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp) return zstrm; } +static struct zcomp_strm *zcomp_strm_single_find(struct zcomp *comp) +{ + struct zcomp_strm_single *zs = comp->stream; + mutex_lock(&zs->strm_lock); + return zs->zstrm; +} + +static void zcomp_strm_single_release(struct zcomp *comp, + struct zcomp_strm *zstrm) +{ + struct zcomp_strm_single *zs = comp->stream; + mutex_unlock(&zs->strm_lock); +} + +static void zcomp_strm_single_destroy(struct zcomp *comp) +{ + struct zcomp_strm_single *zs = comp->stream; + zcomp_strm_free(comp, zs->zstrm); + kfree(zs); +} + +static int zcomp_strm_single_create(struct zcomp *comp) +{ + struct zcomp_strm_single *zs; + + comp->destroy = zcomp_strm_single_destroy; + comp->strm_find = zcomp_strm_single_find; + comp->strm_release = zcomp_strm_single_release; + zs = kmalloc(sizeof(struct zcomp_strm_single), GFP_KERNEL); + if (!zs) + return -ENOMEM; + + comp->stream = zs; + mutex_init(&zs->strm_lock); + zs->zstrm = zcomp_strm_alloc(comp); + if (!zs->zstrm) { + kfree(zs); + return -ENOMEM; + } + return 0; +} + struct zcomp_strm *zcomp_strm_find(struct zcomp *comp) { - mutex_lock(&comp->strm_lock); - return comp->zstrm; + return comp->strm_find(comp); } void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm) { - mutex_unlock(&comp->strm_lock); + comp->strm_release(comp, zstrm); } int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, @@ -80,7 +129,7 @@ int zcomp_decompress(struct zcomp *comp, const unsigned char *src, void zcomp_destroy(struct zcomp *comp) { - zcomp_strm_free(comp, comp->zstrm); + comp->destroy(comp); kfree(comp); } @@ -104,10 +153,7 @@ struct zcomp *zcomp_create(const char *compress) return NULL; comp->backend = backend; - mutex_init(&comp->strm_lock); - - comp->zstrm = zcomp_strm_alloc(comp); - if (!comp->zstrm) { + if (zcomp_strm_single_create(comp) != 0) { kfree(comp); return NULL; } diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index c9a98e1317fe..dc3500d842a3 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -39,9 +39,12 @@ struct zcomp_backend { /* dynamic per-device compression frontend */ struct zcomp { - struct mutex strm_lock; - struct zcomp_strm *zstrm; + void *stream; struct zcomp_backend *backend; + + struct zcomp_strm *(*strm_find)(struct zcomp *comp); + void (*strm_release)(struct zcomp *comp, struct zcomp_strm *zstrm); + void (*destroy)(struct zcomp *comp); }; struct zcomp *zcomp_create(const char *comp); -- cgit v1.2.3 From beca3ec71fe5490ee9237dc42400f50402baf83e Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:14 -0700 Subject: zram: add multi stream functionality Existing zram 
(zcomp) implementation has only one compression stream (buffer and algorithm private part), so in order to prevent data corruption only one write (compress operation) can use this compression stream, forcing all concurrent write operations to wait for stream lock to be released. This patch changes zcomp to keep a compression streams list of user-defined size (via sysfs device attr). Each write operation still exclusively holds compression stream, the difference is that we can have N write operations (depending on size of streams list) executing in parallel. See TEST section later in commit message for performance data. Introduce struct zcomp_strm_multi and a set of functions to manage zcomp_strm stream access. zcomp_strm_multi has a list of idle zcomp_strm structs, spinlock to protect idle list and wait queue, making it possible to perform parallel compressions. The following set of functions added: - zcomp_strm_multi_find()/zcomp_strm_multi_release() find and release a compression stream, implement required locking - zcomp_strm_multi_create()/zcomp_strm_multi_destroy() create and destroy zcomp_strm_multi zcomp ->strm_find() and ->strm_release() callbacks are set during initialisation to zcomp_strm_multi_find()/zcomp_strm_multi_release() correspondingly. Each time zcomp issues a zcomp_strm_multi_find() call, the following set of operations performed: - spin lock strm_lock - if idle list is not empty, remove zcomp_strm from idle list, spin unlock and return zcomp stream pointer to caller - if idle list is empty, current adds itself to wait queue. it will be awaken by zcomp_strm_multi_release() caller. zcomp_strm_multi_release(): - spin lock strm_lock - add zcomp stream to idle list - spin unlock, wake up sleeper Minchan Kim reported that spinlock-based locking scheme has demonstrated a severe perfomance regression for single compression stream case, comparing to mutex-based (see https://lkml.org/lkml/2014/2/18/16) base spinlock mutex ==Initial write ==Initial write ==Initial write records: 5 records: 5 records: 5 avg: 1642424.35 avg: 699610.40 avg: 1655583.71 std: 39890.95(2.43%) std: 232014.19(33.16%) std: 52293.96 max: 1690170.94 max: 1163473.45 max: 1697164.75 min: 1568669.52 min: 573429.88 min: 1553410.23 ==Rewrite ==Rewrite ==Rewrite records: 5 records: 5 records: 5 avg: 1611775.39 avg: 501406.64 avg: 1684419.11 std: 17144.58(1.06%) std: 15354.41(3.06%) std: 18367.42 max: 1641800.95 max: 531356.78 max: 1706445.84 min: 1593515.27 min: 488817.78 min: 1655335.73 When only one compression stream available, mutex with spin on owner tends to perform much better than frequent wait_event()/wake_up(). This is why single stream implemented as a special case with mutex locking. Introduce and document zram device attribute max_comp_streams. This attr shows and stores current zcomp's max number of zcomp streams (max_strm). Extend zcomp's zcomp_create() with `max_strm' parameter. `max_strm' limits the number of zcomp_strm structs in compression backend's idle list (max_comp_streams). max_comp_streams used during initialisation as follows: -- passing to zcomp_create() max_strm equals to 1 will initialise zcomp using single compression stream zcomp_strm_single (mutex-based locking). -- passing to zcomp_create() max_strm greater than 1 will initialise zcomp using multi compression stream zcomp_strm_multi (spinlock-based locking). default max_comp_streams value is 1, meaning that zram with single stream will be initialised. 
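Condensed to its core (an illustrative reduction of the patch, not complete code), the multi stream find/release handshake is:

	/* find: grab an idle stream or sleep for one (pool growth omitted) */
	spin_lock(&zs->strm_lock);
	if (!list_empty(&zs->idle_strm)) {
		zstrm = list_entry(zs->idle_strm.next, struct zcomp_strm, list);
		list_del(&zstrm->list);
		spin_unlock(&zs->strm_lock);
		return zstrm;
	}
	spin_unlock(&zs->strm_lock);
	wait_event(zs->strm_wait, !list_empty(&zs->idle_strm));

	/* release: return the stream and wake one waiter */
	spin_lock(&zs->strm_lock);
	list_add(&zstrm->list, &zs->idle_strm);
	spin_unlock(&zs->strm_lock);
	wake_up(&zs->strm_wait);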
Later patch will introduce configuration knob to change max_comp_streams on already initialised and used zcomp. TEST iozone -t 3 -R -r 16K -s 60M -I +Z test base 1 strm (mutex) 3 strm (spinlock) ----------------------------------------------------------------------- Initial write 589286.78 583518.39 718011.05 Rewrite 604837.97 596776.38 1515125.72 Random write 584120.11 595714.58 1388850.25 Pwrite 535731.17 541117.38 739295.27 Fwrite 1418083.88 1478612.72 1484927.06 Usage example: set max_comp_streams to 4 echo 4 > /sys/block/zram0/max_comp_streams show current max_comp_streams (default value is 1). cat /sys/block/zram0/max_comp_streams Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zcomp.c | 124 +++++++++++++++++++++++++++++++++++++++++- drivers/block/zram/zcomp.h | 4 +- drivers/block/zram/zram_drv.c | 42 +++++++++++++- drivers/block/zram/zram_drv.h | 2 +- 4 files changed, 167 insertions(+), 5 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 72e8071f9d73..c06f75f54718 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -24,6 +24,21 @@ struct zcomp_strm_single { struct zcomp_strm *zstrm; }; +/* + * multi zcomp_strm backend + */ +struct zcomp_strm_multi { + /* protect strm list */ + spinlock_t strm_lock; + /* max possible number of zstrm streams */ + int max_strm; + /* number of available zstrm streams */ + int avail_strm; + /* list of available strms */ + struct list_head idle_strm; + wait_queue_head_t strm_wait; +}; + static struct zcomp_backend *find_backend(const char *compress) { if (strncmp(compress, "lzo", 3) == 0) @@ -62,6 +77,107 @@ static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp) return zstrm; } +/* + * get idle zcomp_strm or wait until other process release + * (zcomp_strm_release()) one for us + */ +static struct zcomp_strm *zcomp_strm_multi_find(struct zcomp *comp) +{ + struct zcomp_strm_multi *zs = comp->stream; + struct zcomp_strm *zstrm; + + while (1) { + spin_lock(&zs->strm_lock); + if (!list_empty(&zs->idle_strm)) { + zstrm = list_entry(zs->idle_strm.next, + struct zcomp_strm, list); + list_del(&zstrm->list); + spin_unlock(&zs->strm_lock); + return zstrm; + } + /* zstrm streams limit reached, wait for idle stream */ + if (zs->avail_strm >= zs->max_strm) { + spin_unlock(&zs->strm_lock); + wait_event(zs->strm_wait, !list_empty(&zs->idle_strm)); + continue; + } + /* allocate new zstrm stream */ + zs->avail_strm++; + spin_unlock(&zs->strm_lock); + + zstrm = zcomp_strm_alloc(comp); + if (!zstrm) { + spin_lock(&zs->strm_lock); + zs->avail_strm--; + spin_unlock(&zs->strm_lock); + wait_event(zs->strm_wait, !list_empty(&zs->idle_strm)); + continue; + } + break; + } + return zstrm; +} + +/* add stream back to idle list and wake up waiter or free the stream */ +static void zcomp_strm_multi_release(struct zcomp *comp, struct zcomp_strm *zstrm) +{ + struct zcomp_strm_multi *zs = comp->stream; + + spin_lock(&zs->strm_lock); + if (zs->avail_strm <= zs->max_strm) { + list_add(&zstrm->list, &zs->idle_strm); + spin_unlock(&zs->strm_lock); + wake_up(&zs->strm_wait); + return; + } + + zs->avail_strm--; + spin_unlock(&zs->strm_lock); + zcomp_strm_free(comp, zstrm); +} + +static void zcomp_strm_multi_destroy(struct zcomp *comp) +{ + struct zcomp_strm_multi *zs = comp->stream; + struct zcomp_strm *zstrm; + + while (!list_empty(&zs->idle_strm)) { + zstrm = 
list_entry(zs->idle_strm.next, + struct zcomp_strm, list); + list_del(&zstrm->list); + zcomp_strm_free(comp, zstrm); + } + kfree(zs); +} + +static int zcomp_strm_multi_create(struct zcomp *comp, int max_strm) +{ + struct zcomp_strm *zstrm; + struct zcomp_strm_multi *zs; + + comp->destroy = zcomp_strm_multi_destroy; + comp->strm_find = zcomp_strm_multi_find; + comp->strm_release = zcomp_strm_multi_release; + zs = kmalloc(sizeof(struct zcomp_strm_multi), GFP_KERNEL); + if (!zs) + return -ENOMEM; + + comp->stream = zs; + spin_lock_init(&zs->strm_lock); + INIT_LIST_HEAD(&zs->idle_strm); + init_waitqueue_head(&zs->strm_wait); + zs->max_strm = max_strm; + zs->avail_strm = 1; + + zstrm = zcomp_strm_alloc(comp); + if (!zstrm) { + kfree(zs); + return -ENOMEM; + } + list_add(&zstrm->list, &zs->idle_strm); + return 0; +} + static struct zcomp_strm *zcomp_strm_single_find(struct zcomp *comp) { struct zcomp_strm_single *zs = comp->stream; @@ -139,7 +255,7 @@ void zcomp_destroy(struct zcomp *comp) * if requested algorithm is not supported or in case * of init error */ -struct zcomp *zcomp_create(const char *compress) +struct zcomp *zcomp_create(const char *compress, int max_strm) { struct zcomp *comp; struct zcomp_backend *backend; @@ -153,7 +269,11 @@ struct zcomp *zcomp_create(const char *compress) return NULL; comp->backend = backend; - if (zcomp_strm_single_create(comp) != 0) { + if (max_strm > 1) + zcomp_strm_multi_create(comp, max_strm); + else + zcomp_strm_single_create(comp); + if (!comp->stream) { kfree(comp); return NULL; } diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index dc3500d842a3..2a3684446160 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -21,6 +21,8 @@ struct zcomp_strm { * working memory) */ void *private; + /* used in multi stream backend, protected by backend strm_lock */ + struct list_head list; }; /* static compression backend */ @@ -47,7 +49,7 @@ struct zcomp { void (*destroy)(struct zcomp *comp); }; -struct zcomp *zcomp_create(const char *comp); +struct zcomp *zcomp_create(const char *comp, int max_strm); void zcomp_destroy(struct zcomp *comp); struct zcomp_strm *zcomp_strm_find(struct zcomp *comp); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 98823f9ca8b1..bdc7eb8c6df7 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -108,6 +108,40 @@ static ssize_t mem_used_total_show(struct device *dev, return sprintf(buf, "%llu\n", val); } +static ssize_t max_comp_streams_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int val; + struct zram *zram = dev_to_zram(dev); + + down_read(&zram->init_lock); + val = zram->max_comp_streams; + up_read(&zram->init_lock); + + return sprintf(buf, "%d\n", val); +} + +static ssize_t max_comp_streams_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + int num; + struct zram *zram = dev_to_zram(dev); + + if (kstrtoint(buf, 0, &num)) + return -EINVAL; + if (num < 1) + return -EINVAL; + down_write(&zram->init_lock); + if (init_done(zram)) { + up_write(&zram->init_lock); + pr_info("Can't set max_comp_streams for initialized device\n"); + return -EBUSY; + } + zram->max_comp_streams = num; + up_write(&zram->init_lock); + return len; +} + /* flag operations needs meta->tb_lock */ static int zram_test_flag(struct zram_meta *meta, u32 index, enum zram_pageflags flag) @@ -502,6 +536,8 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) } 
zcomp_destroy(zram->comp); + zram->max_comp_streams = 1; + zram_meta_free(zram->meta); zram->meta = NULL; /* Reset stats */ @@ -537,7 +573,7 @@ static ssize_t disksize_store(struct device *dev, goto out_free_meta; } - zram->comp = zcomp_create(default_compressor); + zram->comp = zcomp_create(default_compressor, zram->max_comp_streams); if (!zram->comp) { pr_info("Cannot initialise %s compressing backend\n", default_compressor); @@ -698,6 +734,8 @@ static DEVICE_ATTR(initstate, S_IRUGO, initstate_show, NULL); static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store); static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL); static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); +static DEVICE_ATTR(max_comp_streams, S_IRUGO | S_IWUSR, + max_comp_streams_show, max_comp_streams_store); ZRAM_ATTR_RO(num_reads); ZRAM_ATTR_RO(num_writes); @@ -722,6 +760,7 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_orig_data_size.attr, &dev_attr_compr_data_size.attr, &dev_attr_mem_used_total.attr, + &dev_attr_max_comp_streams.attr, NULL, }; @@ -784,6 +823,7 @@ static int create_device(struct zram *zram, int device_id) } zram->meta = NULL; + zram->max_comp_streams = 1; return 0; out_free_disk: diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 45e04f7b713f..ccf36d11755a 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -99,7 +99,7 @@ struct zram { * we can store in a disk. */ u64 disksize; /* bytes */ - + int max_comp_streams; struct zram_stats stats; }; #endif -- cgit v1.2.3 From fe8eb122c82b2049c460fc6df6e8583a2f935cff Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:15 -0700 Subject: zram: add set_max_streams knob This patch allows to change max_comp_streams on initialised zcomp. Introduce zcomp set_max_streams() knob, zcomp_strm_multi_set_max_streams() and zcomp_strm_single_set_max_streams() callbacks to change streams limit for zcomp_strm_multi and zcomp_strm_single, accordingly. set_max_streams for single steam zcomp does nothing. If user has lowered the limit, then zcomp_strm_multi_set_max_streams() attempts to immediately free extra streams (as much as it can, depending on idle streams availability). Note, this patch does not allow to change stream 'policy' from single to multi stream (or vice versa) on already initialised compression backend. Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zcomp.c | 36 ++++++++++++++++++++++++++++++++++++ drivers/block/zram/zcomp.h | 3 +++ drivers/block/zram/zram_drv.c | 5 ++--- 3 files changed, 41 insertions(+), 3 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index c06f75f54718..ac276f79f21c 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -136,6 +136,29 @@ static void zcomp_strm_multi_release(struct zcomp *comp, struct zcomp_strm *zstr zcomp_strm_free(comp, zstrm); } +/* change max_strm limit */ +static int zcomp_strm_multi_set_max_streams(struct zcomp *comp, int num_strm) +{ + struct zcomp_strm_multi *zs = comp->stream; + struct zcomp_strm *zstrm; + + spin_lock(&zs->strm_lock); + zs->max_strm = num_strm; + /* + * if user has lowered the limit and there are idle streams, + * immediately free as much streams (and memory) as we can. 
+ */ + while (zs->avail_strm > num_strm && !list_empty(&zs->idle_strm)) { + zstrm = list_entry(zs->idle_strm.next, + struct zcomp_strm, list); + list_del(&zstrm->list); + zcomp_strm_free(comp, zstrm); + zs->avail_strm--; + } + spin_unlock(&zs->strm_lock); + return 0; +} + static void zcomp_strm_multi_destroy(struct zcomp *comp) { struct zcomp_strm_multi *zs = comp->stream; @@ -158,6 +181,7 @@ static int zcomp_strm_multi_create(struct zcomp *comp, int max_strm) comp->destroy = zcomp_strm_multi_destroy; comp->strm_find = zcomp_strm_multi_find; comp->strm_release = zcomp_strm_multi_release; + comp->set_max_streams = zcomp_strm_multi_set_max_streams; zs = kmalloc(sizeof(struct zcomp_strm_multi), GFP_KERNEL); if (!zs) return -ENOMEM; @@ -192,6 +216,12 @@ static void zcomp_strm_single_release(struct zcomp *comp, mutex_unlock(&zs->strm_lock); } +static int zcomp_strm_single_set_max_streams(struct zcomp *comp, int num_strm) +{ + /* zcomp_strm_single support only max_comp_streams == 1 */ + return -ENOTSUPP; +} + static void zcomp_strm_single_destroy(struct zcomp *comp) { struct zcomp_strm_single *zs = comp->stream; @@ -206,6 +236,7 @@ static int zcomp_strm_single_create(struct zcomp *comp) comp->destroy = zcomp_strm_single_destroy; comp->strm_find = zcomp_strm_single_find; comp->strm_release = zcomp_strm_single_release; + comp->set_max_streams = zcomp_strm_single_set_max_streams; zs = kmalloc(sizeof(struct zcomp_strm_single), GFP_KERNEL); if (!zs) return -ENOMEM; @@ -220,6 +251,11 @@ static int zcomp_strm_single_create(struct zcomp *comp) return 0; } +int zcomp_set_max_streams(struct zcomp *comp, int num_strm) +{ + return comp->set_max_streams(comp, num_strm); +} + struct zcomp_strm *zcomp_strm_find(struct zcomp *comp) { return comp->strm_find(comp); diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index 2a3684446160..bd11d59c5dd1 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -46,6 +46,7 @@ struct zcomp { struct zcomp_strm *(*strm_find)(struct zcomp *comp); void (*strm_release)(struct zcomp *comp, struct zcomp_strm *zstrm); + int (*set_max_streams)(struct zcomp *comp, int num_strm); void (*destroy)(struct zcomp *comp); }; @@ -60,4 +61,6 @@ int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, int zcomp_decompress(struct zcomp *comp, const unsigned char *src, size_t src_len, unsigned char *dst); + +int zcomp_set_max_streams(struct zcomp *comp, int num_strm); #endif /* _ZCOMP_H_ */ diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index bdc7eb8c6df7..3a5f24c341dc 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -133,9 +133,8 @@ static ssize_t max_comp_streams_store(struct device *dev, return -EINVAL; down_write(&zram->init_lock); if (init_done(zram)) { - up_write(&zram->init_lock); - pr_info("Can't set max_comp_streams for initialized device\n"); - return -EBUSY; + if (zcomp_set_max_streams(zram->comp, num)) + pr_info("Cannot change max compression streams\n"); } zram->max_comp_streams = num; up_write(&zram->init_lock); -- cgit v1.2.3 From e46b8a030d76d3c94156c545c3f4c3676d813435 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:17 -0700 Subject: zram: make compression algorithm selection possible Add and document `comp_algorithm' device attribute. 
This attribute allows to show supported compression and currently selected compression algorithms: cat /sys/block/zram0/comp_algorithm [lzo] lz4 and change selected compression algorithm: echo lzo > /sys/block/zram0/comp_algorithm Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zcomp.c | 32 +++++++++++++++++++++++++++++--- drivers/block/zram/zcomp.h | 2 ++ drivers/block/zram/zram_drv.c | 37 ++++++++++++++++++++++++++++++++++--- drivers/block/zram/zram_drv.h | 1 + 4 files changed, 66 insertions(+), 6 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index ac276f79f21c..aad533a8bc55 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -39,11 +39,20 @@ struct zcomp_strm_multi { wait_queue_head_t strm_wait; }; +static struct zcomp_backend *backends[] = { + &zcomp_lzo, + NULL +}; + static struct zcomp_backend *find_backend(const char *compress) { - if (strncmp(compress, "lzo", 3) == 0) - return &zcomp_lzo; - return NULL; + int i = 0; + while (backends[i]) { + if (sysfs_streq(compress, backends[i]->name)) + break; + i++; + } + return backends[i]; } static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *zstrm) @@ -251,6 +260,23 @@ static int zcomp_strm_single_create(struct zcomp *comp) return 0; } +/* show available compressors */ +ssize_t zcomp_available_show(const char *comp, char *buf) +{ + ssize_t sz = 0; + int i = 0; + + while (backends[i]) { + if (sysfs_streq(comp, backends[i]->name)) + sz += sprintf(buf + sz, "[%s] ", backends[i]->name); + else + sz += sprintf(buf + sz, "%s ", backends[i]->name); + i++; + } + sz += sprintf(buf + sz, "\n"); + return sz; +} + int zcomp_set_max_streams(struct zcomp *comp, int num_strm) { return comp->set_max_streams(comp, num_strm); diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index bd11d59c5dd1..8b8997f8613b 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -50,6 +50,8 @@ struct zcomp { void (*destroy)(struct zcomp *comp); }; +ssize_t zcomp_available_show(const char *comp, char *buf); + struct zcomp *zcomp_create(const char *comp, int max_strm); void zcomp_destroy(struct zcomp *comp); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 3a5f24c341dc..15d46f2e158c 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -141,6 +141,34 @@ static ssize_t max_comp_streams_store(struct device *dev, return len; } +static ssize_t comp_algorithm_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + size_t sz; + struct zram *zram = dev_to_zram(dev); + + down_read(&zram->init_lock); + sz = zcomp_available_show(zram->compressor, buf); + up_read(&zram->init_lock); + + return sz; +} + +static ssize_t comp_algorithm_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct zram *zram = dev_to_zram(dev); + down_write(&zram->init_lock); + if (init_done(zram)) { + up_write(&zram->init_lock); + pr_info("Can't change algorithm for initialized device\n"); + return -EBUSY; + } + strlcpy(zram->compressor, buf, sizeof(zram->compressor)); + up_write(&zram->init_lock); + return len; +} + /* flag operations needs meta->tb_lock */ static int zram_test_flag(struct zram_meta *meta, u32 index, enum zram_pageflags flag) @@ -572,10 +600,10 @@ static ssize_t disksize_store(struct device *dev, goto 
out_free_meta; } - zram->comp = zcomp_create(default_compressor, zram->max_comp_streams); + zram->comp = zcomp_create(zram->compressor, zram->max_comp_streams); if (!zram->comp) { pr_info("Cannot initialise %s compressing backend\n", - default_compressor); + zram->compressor); err = -EINVAL; goto out_free_meta; } @@ -735,6 +763,8 @@ static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL); static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); static DEVICE_ATTR(max_comp_streams, S_IRUGO | S_IWUSR, max_comp_streams_show, max_comp_streams_store); +static DEVICE_ATTR(comp_algorithm, S_IRUGO | S_IWUSR, + comp_algorithm_show, comp_algorithm_store); ZRAM_ATTR_RO(num_reads); ZRAM_ATTR_RO(num_writes); @@ -760,6 +790,7 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_compr_data_size.attr, &dev_attr_mem_used_total.attr, &dev_attr_max_comp_streams.attr, + &dev_attr_comp_algorithm.attr, NULL, }; @@ -820,7 +851,7 @@ static int create_device(struct zram *zram, int device_id) pr_warn("Error creating sysfs group"); goto out_free_disk; } - + strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor)); zram->meta = NULL; zram->max_comp_streams = 1; return 0; diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index ccf36d11755a..7f21c145e317 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -101,5 +101,6 @@ struct zram { u64 disksize; /* bytes */ int max_comp_streams; struct zram_stats stats; + char compressor[10]; }; #endif -- cgit v1.2.3 From 6e76668e415adf799839f0ab205142ad7002d260 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:18 -0700 Subject: zram: add lz4 algorithm backend Introduce LZ4 compression backend and make it available for selection. LZ4 support is optional and requires user to set ZRAM_LZ4_COMPRESS config option. The default compression backend is LZO. TEST (x86_64, core i5, 2 cores + 2 hyperthreading, zram disk size 1G, ext4 file system, 3 compression streams) iozone -t 3 -R -r 16K -s 60M -I +Z Test LZO LZ4 ---------------------------------------------- Initial write 1642744.62 1317005.09 Rewrite 2498980.88 1800645.16 Read 3957026.38 5877043.75 Re-read 3950997.38 5861847.00 Reverse Read 2937114.56 5047384.00 Stride read 2948163.19 4929587.38 Random read 3292692.69 4880793.62 Mixed workload 1545602.62 3502940.38 Random write 2448039.75 1758786.25 Pwrite 1670051.03 1338329.69 Pread 2530682.00 5097177.62 Fwrite 3232085.62 3275942.56 Fread 6306880.25 6645271.12 So on my system LZ4 is slower in write-only tests, while it performs better in read-only and mixed (reads + writes) tests. Official LZ4 benchmarks available here http://code.google.com/p/lz4/ (linux kernel uses revision r90). 
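Usage sketch (assuming CONFIG_ZRAM_LZ4_COMPRESS=y and a not-yet-initialised device): the new backend shows up next to lzo and can be selected through the comp_algorithm attribute introduced earlier, e.g.

	cat /sys/block/zram0/comp_algorithm
	[lzo] lz4
	echo lz4 > /sys/block/zram0/comp_algorithm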
Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/Kconfig | 10 +++++++++ drivers/block/zram/Makefile | 2 ++ drivers/block/zram/zcomp.c | 6 ++++++ drivers/block/zram/zcomp_lz4.c | 47 ++++++++++++++++++++++++++++++++++++++++++ drivers/block/zram/zcomp_lz4.h | 17 +++++++++++++++ 5 files changed, 82 insertions(+) create mode 100644 drivers/block/zram/zcomp_lz4.c create mode 100644 drivers/block/zram/zcomp_lz4.h (limited to 'drivers/block') diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index 3450be850399..6489c0fd0ea6 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -15,6 +15,16 @@ config ZRAM See zram.txt for more information. +config ZRAM_LZ4_COMPRESS + bool "Enable LZ4 algorithm support" + depends on ZRAM + select LZ4_COMPRESS + select LZ4_DECOMPRESS + default n + help + This option enables LZ4 compression algorithm support. Compression + algorithm can be changed using `comp_algorithm' device attribute. + config ZRAM_DEBUG bool "Compressed RAM block device debug support" depends on ZRAM diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile index 757c6a5cadff..be0763ff57a2 100644 --- a/drivers/block/zram/Makefile +++ b/drivers/block/zram/Makefile @@ -1,3 +1,5 @@ zram-y := zcomp_lzo.o zcomp.o zram_drv.o +zram-$(CONFIG_ZRAM_LZ4_COMPRESS) += zcomp_lz4.o + obj-$(CONFIG_ZRAM) += zram.o diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index aad533a8bc55..d5919031ca8b 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -15,6 +15,9 @@ #include "zcomp.h" #include "zcomp_lzo.h" +#ifdef CONFIG_ZRAM_LZ4_COMPRESS +#include "zcomp_lz4.h" +#endif /* * single zcomp_strm backend @@ -41,6 +44,9 @@ struct zcomp_strm_multi { static struct zcomp_backend *backends[] = { &zcomp_lzo, +#ifdef CONFIG_ZRAM_LZ4_COMPRESS + &zcomp_lz4, +#endif NULL }; diff --git a/drivers/block/zram/zcomp_lz4.c b/drivers/block/zram/zcomp_lz4.c new file mode 100644 index 000000000000..f2afb7e988c3 --- /dev/null +++ b/drivers/block/zram/zcomp_lz4.c @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include + +#include "zcomp_lz4.h" + +static void *zcomp_lz4_create(void) +{ + return kzalloc(LZ4_MEM_COMPRESS, GFP_KERNEL); +} + +static void zcomp_lz4_destroy(void *private) +{ + kfree(private); +} + +static int zcomp_lz4_compress(const unsigned char *src, unsigned char *dst, + size_t *dst_len, void *private) +{ + /* return : Success if return 0 */ + return lz4_compress(src, PAGE_SIZE, dst, dst_len, private); +} + +static int zcomp_lz4_decompress(const unsigned char *src, size_t src_len, + unsigned char *dst) +{ + size_t dst_len = PAGE_SIZE; + /* return : Success if return 0 */ + return lz4_decompress_unknownoutputsize(src, src_len, dst, &dst_len); +} + +struct zcomp_backend zcomp_lz4 = { + .compress = zcomp_lz4_compress, + .decompress = zcomp_lz4_decompress, + .create = zcomp_lz4_create, + .destroy = zcomp_lz4_destroy, + .name = "lz4", +}; diff --git a/drivers/block/zram/zcomp_lz4.h b/drivers/block/zram/zcomp_lz4.h new file mode 100644 index 000000000000..60613fb29dd8 --- /dev/null +++ b/drivers/block/zram/zcomp_lz4.h @@ -0,0 +1,17 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _ZCOMP_LZ4_H_ +#define _ZCOMP_LZ4_H_ + +#include "zcomp.h" + +extern struct zcomp_backend zcomp_lz4; + +#endif /* _ZCOMP_LZ4_H_ */ -- cgit v1.2.3 From d61f98c70e8b0d324e8e83be2ed546d6295e63f3 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:19 -0700 Subject: zram: move comp allocation out of init_lock While fixing lockdep spew of ->init_lock reported by Sasha Levin [1], Minchan Kim noted [2] that it's better to move compression backend allocation (using GPF_KERNEL) out of the ->init_lock lock, same way as with zram_meta_alloc(), in order to prevent the same lockdep spew. 
[1] https://lkml.org/lkml/2014/2/27/337 [2] https://lkml.org/lkml/2014/3/3/32 Signed-off-by: Sergey Senozhatsky Reported-by: Minchan Kim Acked-by: Minchan Kim Cc: Sasha Levin Acked-by: Jerome Marchand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 15d46f2e158c..e4d536b485e6 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -580,9 +580,10 @@ static ssize_t disksize_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { u64 disksize; + struct zcomp *comp; struct zram_meta *meta; struct zram *zram = dev_to_zram(dev); - int err; + int err = -EINVAL; disksize = memparse(buf, NULL); if (!disksize) @@ -593,30 +594,32 @@ static ssize_t disksize_store(struct device *dev, if (!meta) return -ENOMEM; + comp = zcomp_create(zram->compressor, zram->max_comp_streams); + if (!comp) { + pr_info("Cannot initialise %s compressing backend\n", + zram->compressor); + goto out_cleanup; + } + down_write(&zram->init_lock); if (init_done(zram)) { + up_write(&zram->init_lock); pr_info("Cannot change disksize for initialized device\n"); err = -EBUSY; - goto out_free_meta; - } - - zram->comp = zcomp_create(zram->compressor, zram->max_comp_streams); - if (!zram->comp) { - pr_info("Cannot initialise %s compressing backend\n", - zram->compressor); - err = -EINVAL; - goto out_free_meta; + goto out_cleanup; } zram->meta = meta; + zram->comp = comp; zram->disksize = disksize; set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); up_write(&zram->init_lock); return len; -out_free_meta: - up_write(&zram->init_lock); +out_cleanup: + if (comp) + zcomp_destroy(comp); zram_meta_free(meta); return err; } -- cgit v1.2.3 From fcfa8d95cacf5cbbe6dee6b8d229fe86142266e0 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:20 -0700 Subject: zram: return error-valued pointer from zcomp_create() Instead of returning just NULL, return ERR_PTR from zcomp_create() if compressing backend creation has failed. ERR_PTR(-EINVAL) for unsupported compression algorithm request, ERR_PTR(-ENOMEM) for allocation (zcomp or compression stream) error. Perform IS_ERR() check of returned from zcomp_create() value in disksize_store() and set return code to PTR_ERR(). Change suggested by Jerome Marchand. [akpm@linux-foundation.org: clean up error recovery flow] Signed-off-by: Sergey Senozhatsky Reported-by: Jerome Marchand Cc: Minchan Kim Cc: Nitin Gupta Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zcomp.c | 14 ++++++++------ drivers/block/zram/zram_drv.c | 19 ++++++++++--------- 2 files changed, 18 insertions(+), 15 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index d5919031ca8b..5647d8fe1dc1 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -319,9 +320,10 @@ void zcomp_destroy(struct zcomp *comp) /* * search available compressors for requested algorithm. - * allocate new zcomp and initialize it. return NULL - * if requested algorithm is not supported or in case - * of init error + * allocate new zcomp and initialize it. return compressing + * backend pointer or ERR_PTR if things went bad. 
ERR_PTR(-EINVAL) + * if requested algorithm is not supported, ERR_PTR(-ENOMEM) in + * case of allocation error. */ struct zcomp *zcomp_create(const char *compress, int max_strm) { @@ -330,11 +332,11 @@ struct zcomp *zcomp_create(const char *compress, int max_strm) backend = find_backend(compress); if (!backend) - return NULL; + return ERR_PTR(-EINVAL); comp = kzalloc(sizeof(struct zcomp), GFP_KERNEL); if (!comp) - return NULL; + return ERR_PTR(-ENOMEM); comp->backend = backend; if (max_strm > 1) @@ -343,7 +345,7 @@ struct zcomp *zcomp_create(const char *compress, int max_strm) zcomp_strm_single_create(comp); if (!comp->stream) { kfree(comp); - return NULL; + return ERR_PTR(-ENOMEM); } return comp; } diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index e4d536b485e6..6b462d27e7d7 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "zram_drv.h" @@ -583,7 +584,7 @@ static ssize_t disksize_store(struct device *dev, struct zcomp *comp; struct zram_meta *meta; struct zram *zram = dev_to_zram(dev); - int err = -EINVAL; + int err; disksize = memparse(buf, NULL); if (!disksize) @@ -595,18 +596,18 @@ static ssize_t disksize_store(struct device *dev, return -ENOMEM; comp = zcomp_create(zram->compressor, zram->max_comp_streams); - if (!comp) { + if (IS_ERR(comp)) { pr_info("Cannot initialise %s compressing backend\n", zram->compressor); - goto out_cleanup; + err = PTR_ERR(comp); + goto out_free_meta; } down_write(&zram->init_lock); if (init_done(zram)) { - up_write(&zram->init_lock); pr_info("Cannot change disksize for initialized device\n"); err = -EBUSY; - goto out_cleanup; + goto out_destroy_comp; } zram->meta = meta; @@ -614,12 +615,12 @@ static ssize_t disksize_store(struct device *dev, zram->disksize = disksize; set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); up_write(&zram->init_lock); - return len; -out_cleanup: - if (comp) - zcomp_destroy(comp); +out_destroy_comp: + up_write(&zram->init_lock); + zcomp_destroy(comp); +out_free_meta: zram_meta_free(meta); return err; } -- cgit v1.2.3 From 60a726e33375a1096e85399cfa1327081b4c38be Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 7 Apr 2014 15:38:21 -0700 Subject: zram: propagate error to user When we initialized zcomp with single, we couldn't change max_comp_streams without zram reset but current interface doesn't show any error to user and even it changes max_comp_streams's value without any effect so it would make user very confusing. This patch prevents max_comp_streams's change when zcomp was initialized as single zcomp and emit the error to user(ex, echo). 
[akpm@linux-foundation.org: don't return with the lock held, per Sergey] [fengguang.wu@intel.com: fix coccinelle warnings] Signed-off-by: Minchan Kim Cc: Nitin Gupta Cc: Jerome Marchand Acked-by: Sergey Senozhatsky Signed-off-by: Fengguang Wu Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zcomp.c | 10 +++++----- drivers/block/zram/zcomp.h | 4 ++-- drivers/block/zram/zram_drv.c | 17 +++++++++++++---- 3 files changed, 20 insertions(+), 11 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 5647d8fe1dc1..b0e7592c44d8 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -153,7 +153,7 @@ static void zcomp_strm_multi_release(struct zcomp *comp, struct zcomp_strm *zstr } /* change max_strm limit */ -static int zcomp_strm_multi_set_max_streams(struct zcomp *comp, int num_strm) +static bool zcomp_strm_multi_set_max_streams(struct zcomp *comp, int num_strm) { struct zcomp_strm_multi *zs = comp->stream; struct zcomp_strm *zstrm; @@ -172,7 +172,7 @@ static int zcomp_strm_multi_set_max_streams(struct zcomp *comp, int num_strm) zs->avail_strm--; } spin_unlock(&zs->strm_lock); - return 0; + return true; } static void zcomp_strm_multi_destroy(struct zcomp *comp) @@ -232,10 +232,10 @@ static void zcomp_strm_single_release(struct zcomp *comp, mutex_unlock(&zs->strm_lock); } -static int zcomp_strm_single_set_max_streams(struct zcomp *comp, int num_strm) +static bool zcomp_strm_single_set_max_streams(struct zcomp *comp, int num_strm) { /* zcomp_strm_single support only max_comp_streams == 1 */ - return -ENOTSUPP; + return false; } static void zcomp_strm_single_destroy(struct zcomp *comp) @@ -284,7 +284,7 @@ ssize_t zcomp_available_show(const char *comp, char *buf) return sz; } -int zcomp_set_max_streams(struct zcomp *comp, int num_strm) +bool zcomp_set_max_streams(struct zcomp *comp, int num_strm) { return comp->set_max_streams(comp, num_strm); } diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index 8b8997f8613b..c59d1fca72c0 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -46,7 +46,7 @@ struct zcomp { struct zcomp_strm *(*strm_find)(struct zcomp *comp); void (*strm_release)(struct zcomp *comp, struct zcomp_strm *zstrm); - int (*set_max_streams)(struct zcomp *comp, int num_strm); + bool (*set_max_streams)(struct zcomp *comp, int num_strm); void (*destroy)(struct zcomp *comp); }; @@ -64,5 +64,5 @@ int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, int zcomp_decompress(struct zcomp *comp, const unsigned char *src, size_t src_len, unsigned char *dst); -int zcomp_set_max_streams(struct zcomp *comp, int num_strm); +bool zcomp_set_max_streams(struct zcomp *comp, int num_strm); #endif /* _ZCOMP_H_ */ diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 6b462d27e7d7..80a1cfca1bf0 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -127,19 +127,28 @@ static ssize_t max_comp_streams_store(struct device *dev, { int num; struct zram *zram = dev_to_zram(dev); + int ret; - if (kstrtoint(buf, 0, &num)) - return -EINVAL; + ret = kstrtoint(buf, 0, &num); + if (ret < 0) + return ret; if (num < 1) return -EINVAL; + down_write(&zram->init_lock); if (init_done(zram)) { - if (zcomp_set_max_streams(zram->comp, num)) + if (!zcomp_set_max_streams(zram->comp, num)) { pr_info("Cannot change max compression streams\n"); + ret = -EINVAL; + goto out; + } } + 
zram->max_comp_streams = num; + ret = len; +out: up_write(&zram->init_lock); - return len; + return ret; } static ssize_t comp_algorithm_show(struct device *dev, -- cgit v1.2.3 From 56b4e8cb85827a2ccc4752a2a7148e56b62b7e96 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:22 -0700 Subject: zram: use scnprintf() in attrs show() methods sysfs.txt documentation lists the following requirements: - The buffer will always be PAGE_SIZE bytes in length. On i386, this is 4096. - show() methods should return the number of bytes printed into the buffer. This is the return value of scnprintf(). - show() should always use scnprintf(). Use scnprintf() in show() functions. Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zcomp.c | 8 +++++--- drivers/block/zram/zram_drv.c | 12 ++++++------ 2 files changed, 11 insertions(+), 9 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index b0e7592c44d8..f1ff39a3d1c1 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -275,12 +275,14 @@ ssize_t zcomp_available_show(const char *comp, char *buf) while (backends[i]) { if (sysfs_streq(comp, backends[i]->name)) - sz += sprintf(buf + sz, "[%s] ", backends[i]->name); + sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, + "[%s] ", backends[i]->name); else - sz += sprintf(buf + sz, "%s ", backends[i]->name); + sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, + "%s ", backends[i]->name); i++; } - sz += sprintf(buf + sz, "\n"); + sz += scnprintf(buf + sz, PAGE_SIZE - sz, "\n"); return sz; } diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 80a1cfca1bf0..15e7b8e64cbb 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -48,7 +48,7 @@ static ssize_t zram_attr_##name##_show(struct device *d, \ struct device_attribute *attr, char *b) \ { \ struct zram *zram = dev_to_zram(d); \ - return sprintf(b, "%llu\n", \ + return scnprintf(b, PAGE_SIZE, "%llu\n", \ (u64)atomic64_read(&zram->stats.name)); \ } \ static struct device_attribute dev_attr_##name = \ @@ -69,7 +69,7 @@ static ssize_t disksize_show(struct device *dev, { struct zram *zram = dev_to_zram(dev); - return sprintf(buf, "%llu\n", zram->disksize); + return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize); } static ssize_t initstate_show(struct device *dev, @@ -82,7 +82,7 @@ static ssize_t initstate_show(struct device *dev, val = init_done(zram); up_read(&zram->init_lock); - return sprintf(buf, "%u\n", val); + return scnprintf(buf, PAGE_SIZE, "%u\n", val); } static ssize_t orig_data_size_show(struct device *dev, @@ -90,7 +90,7 @@ static ssize_t orig_data_size_show(struct device *dev, { struct zram *zram = dev_to_zram(dev); - return sprintf(buf, "%llu\n", + return scnprintf(buf, PAGE_SIZE, "%llu\n", (u64)(atomic64_read(&zram->stats.pages_stored)) << PAGE_SHIFT); } @@ -106,7 +106,7 @@ static ssize_t mem_used_total_show(struct device *dev, val = zs_get_total_size_bytes(meta->mem_pool); up_read(&zram->init_lock); - return sprintf(buf, "%llu\n", val); + return scnprintf(buf, PAGE_SIZE, "%llu\n", val); } static ssize_t max_comp_streams_show(struct device *dev, @@ -119,7 +119,7 @@ static ssize_t max_comp_streams_show(struct device *dev, val = zram->max_comp_streams; up_read(&zram->init_lock); - return sprintf(buf, "%d\n", val); + return scnprintf(buf, PAGE_SIZE, "%d\n", val); } static ssize_t 
max_comp_streams_store(struct device *dev, -- cgit v1.2.3 From f4659d8e620d08bd1a84a8aec5d2f5294a242764 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Mon, 7 Apr 2014 15:38:24 -0700 Subject: zram: support REQ_DISCARD zram is ram based block device and can be used by backend of filesystem. When filesystem deletes a file, it normally doesn't do anything on data block of that file. It just marks on metadata of that file. This behavior has no problem on disk based block device, but has problems on ram based block device, since we can't free memory used for data block. To overcome this disadvantage, there is REQ_DISCARD functionality. If block device support REQ_DISCARD and filesystem is mounted with discard option, filesystem sends REQ_DISCARD to block device whenever some data blocks are discarded. All we have to do is to handle this request. This patch implements to flag up QUEUE_FLAG_DISCARD and handle this REQ_DISCARD request. With it, we can free memory used by zram if it isn't used. [akpm@linux-foundation.org: tweak comments] Signed-off-by: Joonsoo Kim Cc: Minchan Kim Cc: Nitin Gupta Cc: Sergey Senozhatsky Cc: Jerome Marchand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 62 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 15e7b8e64cbb..9849b5233bf4 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -551,6 +551,47 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, return ret; } +/* + * zram_bio_discard - handler on discard request + * @index: physical block index in PAGE_SIZE units + * @offset: byte offset within physical block + */ +static void zram_bio_discard(struct zram *zram, u32 index, + int offset, struct bio *bio) +{ + size_t n = bio->bi_iter.bi_size; + + /* + * zram manages data in physical block size units. Because logical block + * size isn't identical with physical block size on some arch, we + * could get a discard request pointing to a specific offset within a + * certain physical block. Although we can handle this request by + * reading that physiclal block and decompressing and partially zeroing + * and re-compressing and then re-storing it, this isn't reasonable + * because our intent with a discard request is to save memory. So + * skipping this logical block is appropriate here. + */ + if (offset) { + if (n < offset) + return; + + n -= offset; + index++; + } + + while (n >= PAGE_SIZE) { + /* + * Discard request can be large so the lock hold times could be + * lengthy. So take the lock once per page. 
+ */ + write_lock(&zram->meta->tb_lock); + zram_free_page(zram, index); + write_unlock(&zram->meta->tb_lock); + index++; + n -= PAGE_SIZE; + } +} + static void zram_reset_device(struct zram *zram, bool reset_capacity) { size_t index; @@ -686,6 +727,12 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) offset = (bio->bi_iter.bi_sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT; + if (unlikely(bio->bi_rw & REQ_DISCARD)) { + zram_bio_discard(zram, index, offset, bio); + bio_endio(bio, 0); + return; + } + bio_for_each_segment(bvec, bio, iter) { int max_transfer_size = PAGE_SIZE - offset; @@ -855,6 +902,21 @@ static int create_device(struct zram *zram, int device_id) ZRAM_LOGICAL_BLOCK_SIZE); blk_queue_io_min(zram->disk->queue, PAGE_SIZE); blk_queue_io_opt(zram->disk->queue, PAGE_SIZE); + zram->disk->queue->limits.discard_granularity = PAGE_SIZE; + zram->disk->queue->limits.max_discard_sectors = UINT_MAX; + /* + * zram_bio_discard() will clear all logical blocks if logical block + * size is identical with physical block size(PAGE_SIZE). But if it is + * different, we will skip discarding some parts of logical blocks in + * the part of the request range which isn't aligned to physical block + * size. So we can't ensure that all discarded logical blocks are + * zeroed. + */ + if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE) + zram->disk->queue->limits.discard_zeroes_data = 1; + else + zram->disk->queue->limits.discard_zeroes_data = 0; + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zram->disk->queue); add_disk(zram->disk); -- cgit v1.2.3 From 44bd70c347c466616e430b044c49d48fac29789d Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 8 Apr 2014 13:43:52 -0700 Subject: drivers/block/loop.c: ratelimit error messages Metric tons of high speed spew is not helpful when things go pear shaped. systemd lost its mind, forgot how to stop services it insists on being sole manager of, massive printk() flood ensued, box eventually died. [16206.684000] loop: Write error at byte offset 11412291584, length 4096. [16206.684000] systemd-journald[1758]: /dev/kmsg buffer overrun, some messages lost. [16206.684000] loop: Write error at byte offset 13155434496, length 4096. [16206.684000] loop: Write error at byte offset 13155438592, length 4096. [16206.684000] loop: Write error at byte offset 13155442688, length 4096. [16206.684000] loop: Write error at byte offset 13960736768, length 4096. [16206.684000] loop: Write error at byte offset 14229172224, length 4096. [16206.684000] systemd-journald[1758]: /dev/kmsg buffer overrun, some messages lost. [16206.684000] loop: Write error at byte offset 14766043136, length 4096. [16206.684000] loop: Write error at byte offset 15034478592, length 4096. [16206.684000] systemd-journald[1758]: /dev/kmsg buffer overrun, some messages lost. 
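For reference, printk_ratelimited() is the stock helper for exactly this situation; it expands to roughly the following pattern (see include/linux/ratelimit.h), which by default lets through a burst of 10 messages per 5-second interval and drops the rest:

    static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
                                  DEFAULT_RATELIMIT_BURST);

    if (__ratelimit(&_rs))
        printk(KERN_ERR "loop: Write error at byte offset %llu, length %i.\n",
               (unsigned long long)pos, len);
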
Signed-off-by: Mike Galbraith Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- drivers/block/loop.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 66e8c3b94ef3..f70a230a2945 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -237,7 +237,7 @@ static int __do_lo_send_write(struct file *file, file_end_write(file); if (likely(bw == len)) return 0; - printk(KERN_ERR "loop: Write error at byte offset %llu, length %i.\n", + printk_ratelimited(KERN_ERR "loop: Write error at byte offset %llu, length %i.\n", (unsigned long long)pos, len); if (bw >= 0) bw = -EIO; @@ -277,7 +277,7 @@ static int do_lo_send_write(struct loop_device *lo, struct bio_vec *bvec, return __do_lo_send_write(lo->lo_backing_file, page_address(page), bvec->bv_len, pos); - printk(KERN_ERR "loop: Transfer error at byte offset %llu, " + printk_ratelimited(KERN_ERR "loop: Transfer error at byte offset %llu, " "length %i.\n", (unsigned long long)pos, bvec->bv_len); if (ret > 0) ret = -EIO; @@ -316,7 +316,7 @@ static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos) out: return ret; fail: - printk(KERN_ERR "loop: Failed to allocate temporary page for write.\n"); + printk_ratelimited(KERN_ERR "loop: Failed to allocate temporary page for write.\n"); ret = -ENOMEM; goto out; } @@ -345,7 +345,7 @@ lo_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, size = p->bsize; if (lo_do_transfer(lo, READ, page, buf->offset, p->page, p->offset, size, IV)) { - printk(KERN_ERR "loop: transfer error block %ld\n", + printk_ratelimited(KERN_ERR "loop: transfer error block %ld\n", page->index); size = -EINVAL; } -- cgit v1.2.3 From 42f614201e80ff4cfb8b285d7190149a8e1e6cec Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 24 Mar 2014 10:46:25 -0600 Subject: NVMe: per-cpu io queues The device's IO queues are associated with CPUs, so we can use a per-cpu variable to map the a qid to a cpu. This provides a convienient way to optimally assign queues to multiple cpus when the device supports fewer queues than the host has cpus. The previous implementation may have assigned these poorly in these situations. This patch addresses this by sharing queues among cpus that are "close" together and should have a lower lock contention penalty. 
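The heart of the change is a per-CPU queue-id table that replaces the old get_cpu() + 1 indexing. A minimal sketch of that pattern, using the io_queue member the patch below adds (illustration only, allocation error handling omitted):

    /* one qid slot per possible CPU */
    dev->io_queue = alloc_percpu(unsigned short);

    /* assignment path: point CPU 'cpu' at hardware queue 'qid' */
    *per_cpu_ptr(dev->io_queue, cpu) = qid;

    /* submission path: pin the current CPU and look up its queue */
    qid = get_cpu_var(*dev->io_queue);   /* disables preemption */
    /* ... submit on dev->queues[qid] under rcu_read_lock() ... */
    put_cpu_var(*dev->io_queue);         /* re-enables preemption */
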
Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 204 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 167 insertions(+), 37 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index e9495f0bfad3..48d7bd55207a 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -35,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -96,6 +98,7 @@ struct nvme_queue { u8 cq_phase; u8 cqe_seen; u8 q_suspended; + cpumask_var_t cpu_mask; struct async_cmd_info cmdinfo; unsigned long cmdid_data[]; }; @@ -270,14 +273,15 @@ static struct nvme_queue *raw_nvmeq(struct nvme_dev *dev, int qid) static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU) { + unsigned queue_id = get_cpu_var(*dev->io_queue); rcu_read_lock(); - return rcu_dereference(dev->queues[get_cpu() + 1]); + return rcu_dereference(dev->queues[queue_id]); } static void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU) { - put_cpu(); rcu_read_unlock(); + put_cpu_var(nvmeq->dev->io_queue); } static struct nvme_queue *lock_nvmeq(struct nvme_dev *dev, int q_idx) @@ -1121,6 +1125,8 @@ static void nvme_free_queue(struct rcu_head *r) (void *)nvmeq->cqes, nvmeq->cq_dma_addr); dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), nvmeq->sq_cmds, nvmeq->sq_dma_addr); + if (nvmeq->qid) + free_cpumask_var(nvmeq->cpu_mask); kfree(nvmeq); } @@ -1128,8 +1134,6 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest) { int i; - for (i = num_possible_cpus(); i > dev->queue_count - 1; i--) - rcu_assign_pointer(dev->queues[i], NULL); for (i = dev->queue_count - 1; i >= lowest; i--) { struct nvme_queue *nvmeq = raw_nvmeq(dev, i); rcu_assign_pointer(dev->queues[i], NULL); @@ -1154,6 +1158,7 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) return 1; } nvmeq->q_suspended = 1; + nvmeq->dev->online_queues--; spin_unlock_irq(&nvmeq->q_lock); irq_set_affinity_hint(vector, NULL); @@ -1208,6 +1213,9 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, if (!nvmeq->sq_cmds) goto free_cqdma; + if (qid && !zalloc_cpumask_var(&nvmeq->cpu_mask, GFP_KERNEL)) + goto free_sqdma; + nvmeq->q_dmadev = dmadev; nvmeq->dev = dev; snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d", @@ -1228,6 +1236,9 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, return nvmeq; + free_sqdma: + dma_free_coherent(dmadev, SQ_SIZE(depth), (void *)nvmeq->sq_cmds, + nvmeq->sq_dma_addr); free_cqdma: dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes, nvmeq->cq_dma_addr); @@ -1260,6 +1271,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); nvme_cancel_ios(nvmeq, false); nvmeq->q_suspended = 0; + dev->online_queues++; } static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) @@ -1835,6 +1847,143 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, return NULL; } +static int nvme_find_closest_node(int node) +{ + int n, val, min_val = INT_MAX, best_node = node; + + for_each_online_node(n) { + if (n == node) + continue; + val = node_distance(node, n); + if (val < min_val) { + min_val = val; + best_node = n; + } + } + return best_node; +} + +static void nvme_set_queue_cpus(cpumask_t *qmask, struct nvme_queue *nvmeq, + int count) +{ + int cpu; + for_each_cpu(cpu, qmask) { + 
if (cpumask_weight(nvmeq->cpu_mask) >= count) + break; + if (!cpumask_test_and_set_cpu(cpu, nvmeq->cpu_mask)) + *per_cpu_ptr(nvmeq->dev->io_queue, cpu) = nvmeq->qid; + } +} + +static void nvme_add_cpus(cpumask_t *mask, const cpumask_t *unassigned_cpus, + const cpumask_t *new_mask, struct nvme_queue *nvmeq, int cpus_per_queue) +{ + int next_cpu; + for_each_cpu(next_cpu, new_mask) { + cpumask_or(mask, mask, get_cpu_mask(next_cpu)); + cpumask_or(mask, mask, topology_thread_cpumask(next_cpu)); + cpumask_and(mask, mask, unassigned_cpus); + nvme_set_queue_cpus(mask, nvmeq, cpus_per_queue); + } +} + +static void nvme_create_io_queues(struct nvme_dev *dev) +{ + unsigned i, max; + + max = min(dev->max_qid, num_online_cpus()); + for (i = dev->queue_count; i <= max; i++) + if (!nvme_alloc_queue(dev, i, dev->q_depth, i - 1)) + break; + + max = min(dev->queue_count - 1, num_online_cpus()); + for (i = dev->online_queues; i <= max; i++) + if (nvme_create_queue(raw_nvmeq(dev, i), i)) + break; +} + +/* + * If there are fewer queues than online cpus, this will try to optimally + * assign a queue to multiple cpus by grouping cpus that are "close" together: + * thread siblings, core, socket, closest node, then whatever else is + * available. + */ +static void nvme_assign_io_queues(struct nvme_dev *dev) +{ + unsigned cpu, cpus_per_queue, queues, remainder, i; + cpumask_var_t unassigned_cpus; + + nvme_create_io_queues(dev); + + queues = min(dev->online_queues - 1, num_online_cpus()); + if (!queues) + return; + + cpus_per_queue = num_online_cpus() / queues; + remainder = queues - (num_online_cpus() - queues * cpus_per_queue); + + if (!alloc_cpumask_var(&unassigned_cpus, GFP_KERNEL)) + return; + + cpumask_copy(unassigned_cpus, cpu_online_mask); + cpu = cpumask_first(unassigned_cpus); + for (i = 1; i <= queues; i++) { + struct nvme_queue *nvmeq = lock_nvmeq(dev, i); + cpumask_t mask; + + cpumask_clear(nvmeq->cpu_mask); + if (!cpumask_weight(unassigned_cpus)) { + unlock_nvmeq(nvmeq); + break; + } + + mask = *get_cpu_mask(cpu); + nvme_set_queue_cpus(&mask, nvmeq, cpus_per_queue); + if (cpus_weight(mask) < cpus_per_queue) + nvme_add_cpus(&mask, unassigned_cpus, + topology_thread_cpumask(cpu), + nvmeq, cpus_per_queue); + if (cpus_weight(mask) < cpus_per_queue) + nvme_add_cpus(&mask, unassigned_cpus, + topology_core_cpumask(cpu), + nvmeq, cpus_per_queue); + if (cpus_weight(mask) < cpus_per_queue) + nvme_add_cpus(&mask, unassigned_cpus, + cpumask_of_node(cpu_to_node(cpu)), + nvmeq, cpus_per_queue); + if (cpus_weight(mask) < cpus_per_queue) + nvme_add_cpus(&mask, unassigned_cpus, + cpumask_of_node( + nvme_find_closest_node( + cpu_to_node(cpu))), + nvmeq, cpus_per_queue); + if (cpus_weight(mask) < cpus_per_queue) + nvme_add_cpus(&mask, unassigned_cpus, + unassigned_cpus, + nvmeq, cpus_per_queue); + + WARN(cpumask_weight(nvmeq->cpu_mask) != cpus_per_queue, + "nvme%d qid:%d mis-matched queue-to-cpu assignment\n", + dev->instance, i); + + irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector, + nvmeq->cpu_mask); + cpumask_andnot(unassigned_cpus, unassigned_cpus, + nvmeq->cpu_mask); + cpu = cpumask_next(cpu, unassigned_cpus); + if (remainder && !--remainder) + cpus_per_queue++; + unlock_nvmeq(nvmeq); + } + WARN(cpumask_weight(unassigned_cpus), "nvme%d unassigned online cpus\n", + dev->instance); + i = 0; + cpumask_andnot(unassigned_cpus, cpu_possible_mask, cpu_online_mask); + for_each_cpu(cpu, unassigned_cpus) + *per_cpu_ptr(dev->io_queue, cpu) = (i++ % queues) + 1; + free_cpumask_var(unassigned_cpus); +} + static int 
set_queue_count(struct nvme_dev *dev, int count) { int status; @@ -1857,9 +2006,9 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) { struct nvme_queue *adminq = raw_nvmeq(dev, 0); struct pci_dev *pdev = dev->pci_dev; - int result, cpu, i, vecs, nr_io_queues, size, q_depth; + int result, i, vecs, nr_io_queues, size; - nr_io_queues = num_online_cpus(); + nr_io_queues = num_possible_cpus(); result = set_queue_count(dev, nr_io_queues); if (result < 0) return result; @@ -1919,6 +2068,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) * number of interrupts. */ nr_io_queues = vecs; + dev->max_qid = nr_io_queues; result = queue_request_irq(dev, adminq, adminq->irqname); if (result) { @@ -1927,36 +2077,8 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) } /* Free previously allocated queues that are no longer usable */ - nvme_free_queues(dev, nr_io_queues); - - cpu = cpumask_first(cpu_online_mask); - for (i = 0; i < nr_io_queues; i++) { - irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu)); - cpu = cpumask_next(cpu, cpu_online_mask); - } - - q_depth = min_t(int, NVME_CAP_MQES(readq(&dev->bar->cap)) + 1, - NVME_Q_DEPTH); - for (i = dev->queue_count - 1; i < nr_io_queues; i++) { - if (!nvme_alloc_queue(dev, i + 1, q_depth, i)) { - result = -ENOMEM; - goto free_queues; - } - } - - for (; i < num_possible_cpus(); i++) { - int target = i % rounddown_pow_of_two(dev->queue_count - 1); - rcu_assign_pointer(dev->queues[i + 1], dev->queues[target + 1]); - } - - for (i = 1; i < dev->queue_count; i++) { - result = nvme_create_queue(raw_nvmeq(dev, i), i); - if (result) { - for (--i; i > 0; i--) - nvme_disable_queue(dev, i); - goto free_queues; - } - } + nvme_free_queues(dev, nr_io_queues + 1); + nvme_assign_io_queues(dev); return 0; @@ -2035,6 +2157,7 @@ static int nvme_dev_add(struct nvme_dev *dev) static int nvme_dev_map(struct nvme_dev *dev) { + u64 cap; int bars, result = -ENOMEM; struct pci_dev *pdev = dev->pci_dev; @@ -2058,7 +2181,9 @@ static int nvme_dev_map(struct nvme_dev *dev) result = -ENODEV; goto unmap; } - dev->db_stride = 1 << NVME_CAP_STRIDE(readq(&dev->bar->cap)); + cap = readq(&dev->bar->cap); + dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH); + dev->db_stride = 1 << NVME_CAP_STRIDE(cap); dev->dbs = ((void __iomem *)dev->bar) + 4096; return 0; @@ -2332,6 +2457,7 @@ static void nvme_free_dev(struct kref *kref) struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref); nvme_free_namespaces(dev); + free_percpu(dev->io_queue); kfree(dev->queues); kfree(dev->entry); kfree(dev); @@ -2477,6 +2603,9 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) GFP_KERNEL); if (!dev->queues) goto free; + dev->io_queue = alloc_percpu(unsigned short); + if (!dev->io_queue) + goto free; INIT_LIST_HEAD(&dev->namespaces); INIT_WORK(&dev->reset_work, nvme_reset_failed_dev); @@ -2526,6 +2655,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) release: nvme_release_instance(dev); free: + free_percpu(dev->io_queue); kfree(dev->queues); kfree(dev->entry); kfree(dev); -- cgit v1.2.3 From 33b1e95c90447ea73e37e837ea0268a894919f19 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 24 Mar 2014 10:46:26 -0600 Subject: NVMe: CPU hot plug notification Registers with hot cpu notification to rebalance, and potentially allocate additional, io queues. 
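For background, the callback follows the usual hotplug notifier contract of this era: it is invoked with an action code for each CPU event and returns NOTIFY_OK when it has nothing to veto. A hedged sketch of the shape (names are illustrative, not the driver's):

    static int my_cpu_notify(struct notifier_block *self,
                             unsigned long action, void *hcpu)
    {
        switch (action) {
        case CPU_ONLINE:
        case CPU_DEAD:
            /* re-run queue-to-CPU assignment over the new online set */
            break;
        }
        return NOTIFY_OK;
    }

    static struct notifier_block nb = { .notifier_call = my_cpu_notify };

    register_hotcpu_notifier(&nb);      /* during setup */
    unregister_hotcpu_notifier(&nb);    /* during teardown */
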
Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 48d7bd55207a..ce5a4f1a3950 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2002,6 +2002,19 @@ static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride); } +static int nvme_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + struct nvme_dev *dev = container_of(self, struct nvme_dev, nb); + switch (action) { + case CPU_ONLINE: + case CPU_DEAD: + nvme_assign_io_queues(dev); + break; + } + return NOTIFY_OK; +} + static int nvme_setup_io_queues(struct nvme_dev *dev) { struct nvme_queue *adminq = raw_nvmeq(dev, 0); @@ -2080,6 +2093,11 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) nvme_free_queues(dev, nr_io_queues + 1); nvme_assign_io_queues(dev); + dev->nb.notifier_call = &nvme_cpu_notify; + result = register_hotcpu_notifier(&dev->nb); + if (result) + goto free_queues; + return 0; free_queues: @@ -2357,6 +2375,7 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) int i; dev->initialized = 0; + unregister_hotcpu_notifier(&dev->nb); spin_lock(&dev_list_lock); list_del_init(&dev->node); -- cgit v1.2.3 From b355084a891985d4cd0ca23b1a83366af2c4232d Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 4 Apr 2014 11:43:36 -0600 Subject: NVMe: Make I/O timeout a module parameter Increase the default timeout to 30 seconds to match SCSI. Signed-off-by: Keith Busch [use byte instead of ushort] Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index ce5a4f1a3950..7c57b1d955a1 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -50,6 +50,10 @@ #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) #define ADMIN_TIMEOUT (60 * HZ) +unsigned char io_timeout = 30; +module_param(io_timeout, byte, 0644); +MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); + static int nvme_major; module_param(nvme_major, int, 0); -- cgit v1.2.3 From b9afca3efb18a9b8392cb544a3e29e8b1168400c Mon Sep 17 00:00:00 2001 From: Dan McLeran Date: Mon, 7 Apr 2014 17:10:11 -0600 Subject: NVMe: Start-stop nvme_thread during device add-remove. Done to ensure nvme_thread is not running when there are no devices to poll. Signed-off-by: Dan McLeran Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 56 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 42 insertions(+), 14 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 7c57b1d955a1..2d69bfec95a4 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -64,6 +64,7 @@ static DEFINE_SPINLOCK(dev_list_lock); static LIST_HEAD(dev_list); static struct task_struct *nvme_thread; static struct workqueue_struct *nvme_workq; +static wait_queue_head_t nvme_kthread_wait; static void nvme_reset_failed_dev(struct work_struct *ws); @@ -2374,6 +2375,26 @@ static void nvme_disable_io_queues(struct nvme_dev *dev) kthread_stop(kworker_task); } +/* +* Remove the node from the device list and check +* for whether or not we need to stop the nvme_thread. 
+*/ +static void nvme_dev_list_remove(struct nvme_dev *dev) +{ + struct task_struct *tmp = NULL; + + spin_lock(&dev_list_lock); + list_del_init(&dev->node); + if (list_empty(&dev_list) && !IS_ERR_OR_NULL(nvme_thread)) { + tmp = nvme_thread; + nvme_thread = NULL; + } + spin_unlock(&dev_list_lock); + + if (tmp) + kthread_stop(tmp); +} + static void nvme_dev_shutdown(struct nvme_dev *dev) { int i; @@ -2381,9 +2402,7 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) dev->initialized = 0; unregister_hotcpu_notifier(&dev->nb); - spin_lock(&dev_list_lock); - list_del_init(&dev->node); - spin_unlock(&dev_list_lock); + nvme_dev_list_remove(dev); if (!dev->bar || (dev->bar && readl(&dev->bar->csts) == -1)) { for (i = dev->queue_count - 1; i >= 0; i--) { @@ -2524,6 +2543,7 @@ static const struct file_operations nvme_dev_fops = { static int nvme_dev_start(struct nvme_dev *dev) { int result; + bool start_thread = false; result = nvme_dev_map(dev); if (result) @@ -2534,9 +2554,24 @@ static int nvme_dev_start(struct nvme_dev *dev) goto unmap; spin_lock(&dev_list_lock); + if (list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) { + start_thread = true; + nvme_thread = NULL; + } list_add(&dev->node, &dev_list); spin_unlock(&dev_list_lock); + if (start_thread) { + nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); + wake_up(&nvme_kthread_wait); + } else + wait_event_killable(nvme_kthread_wait, nvme_thread); + + if (IS_ERR_OR_NULL(nvme_thread)) { + result = nvme_thread ? PTR_ERR(nvme_thread) : -EINTR; + goto disable; + } + result = nvme_setup_io_queues(dev); if (result && result != -EBUSY) goto disable; @@ -2545,9 +2580,7 @@ static int nvme_dev_start(struct nvme_dev *dev) disable: nvme_disable_queue(dev, 0); - spin_lock(&dev_list_lock); - list_del_init(&dev->node); - spin_unlock(&dev_list_lock); + nvme_dev_list_remove(dev); unmap: nvme_dev_unmap(dev); return result; @@ -2776,14 +2809,11 @@ static int __init nvme_init(void) { int result; - nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); - if (IS_ERR(nvme_thread)) - return PTR_ERR(nvme_thread); + init_waitqueue_head(&nvme_kthread_wait); - result = -ENOMEM; nvme_workq = create_singlethread_workqueue("nvme"); if (!nvme_workq) - goto kill_kthread; + return -ENOMEM; result = register_blkdev(nvme_major, "nvme"); if (result < 0) @@ -2800,8 +2830,6 @@ static int __init nvme_init(void) unregister_blkdev(nvme_major, "nvme"); kill_workq: destroy_workqueue(nvme_workq); - kill_kthread: - kthread_stop(nvme_thread); return result; } @@ -2810,7 +2838,7 @@ static void __exit nvme_exit(void) pci_unregister_driver(&nvme_driver); unregister_blkdev(nvme_major, "nvme"); destroy_workqueue(nvme_workq); - kthread_stop(nvme_thread); + BUG_ON(nvme_thread && !IS_ERR(nvme_thread)); } MODULE_AUTHOR("Matthew Wilcox "); -- cgit v1.2.3 From 4cc09e2dc4cbe6009c935b6f12a8376f09124bc5 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 2 Apr 2014 15:45:37 -0600 Subject: NVMe: Add getgeo to block ops Some programs require HDIO_GETGEO work, which requires we implement getgeo. 
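For reference, a minimal userspace sketch of the HDIO_GETGEO call this enables (the device path is illustrative):

    #include <stdio.h>
    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <linux/hdreg.h>

    int main(void)
    {
        struct hd_geometry geo;
        int fd = open("/dev/nvme0n1", O_RDONLY);   /* illustrative path */

        if (fd < 0 || ioctl(fd, HDIO_GETGEO, &geo) < 0) {
            perror("HDIO_GETGEO");
            return 1;
        }
        printf("heads=%u sectors=%u cylinders=%u start=%lu\n",
               geo.heads, geo.sectors, geo.cylinders, geo.start);
        return 0;
    }
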
Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 2d69bfec95a4..596e2abd7971 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -1714,12 +1715,22 @@ static void nvme_release(struct gendisk *disk, fmode_t mode) kref_put(&dev->kref, nvme_free_dev); } +static int nvme_getgeo(struct block_device *bd, struct hd_geometry *geo) +{ + /* some standard values */ + geo->heads = 1 << 6; + geo->sectors = 1 << 5; + geo->cylinders = get_capacity(bd->bd_disk) >> 11; + return 0; +} + static const struct block_device_operations nvme_fops = { .owner = THIS_MODULE, .ioctl = nvme_ioctl, .compat_ioctl = nvme_compat_ioctl, .open = nvme_open, .release = nvme_release, + .getgeo = nvme_getgeo, }; static void nvme_resubmit_bios(struct nvme_queue *nvmeq) -- cgit v1.2.3 From edd10d33283899fb15d99a290dcc9ceb3604ca78 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 3 Apr 2014 16:45:23 -0600 Subject: NVMe: Retry failed commands with non-fatal errors For commands returned with failed status, queue these for resubmission and continue retrying them until success or for a limited amount of time. The final timeout was arbitrarily chosen so requests can't be retried indefinitely. Since these are requeued on the nvmeq that submitted the command, the callbacks have to take an nvmeq instead of an nvme_dev as a parameter so that we can use the locked queue to append the iod to retry later. The nvme_iod conviently can be used to track how long we've been trying to successfully complete an iod request. The nvme_iod also provides the nvme prp dma mappings, so I had to move a few things around so we can keep those mappings. 
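The retry decision that bio_completion() makes below boils down to three checks; paraphrased as a sketch (the helper name is invented for illustration, not a literal copy of the patch; NVME_SC_DNR, REQ_FAILFAST_MASK and the IOD_TIMEOUT window all come from the patch itself):

    /* sketch: is a failed iod worth requeueing on nvmeq->iod_bio? */
    static bool nvme_should_retry(u16 status, struct bio *bio,
                                  struct nvme_iod *iod)
    {
        if (status & NVME_SC_DNR)            /* controller said "do not retry" */
            return false;
        if (bio->bi_rw & REQ_FAILFAST_MASK)  /* submitter wants fast failure */
            return false;
        return (jiffies - iod->start_time) < IOD_TIMEOUT;
    }
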
Signed-off-by: Keith Busch [fixed checkpatch issue with long line] Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 235 ++++++++++++++++++++++++++++------------------ drivers/block/nvme-scsi.c | 10 +- 2 files changed, 151 insertions(+), 94 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 596e2abd7971..efa9c8f4a7a7 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -50,6 +50,7 @@ #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) #define ADMIN_TIMEOUT (60 * HZ) +#define IOD_TIMEOUT (4 * NVME_IO_TIMEOUT) unsigned char io_timeout = 30; module_param(io_timeout, byte, 0644); @@ -94,6 +95,7 @@ struct nvme_queue { wait_queue_head_t sq_full; wait_queue_t sq_cong_wait; struct bio_list sq_cong; + struct list_head iod_bio; u32 __iomem *q_db; u16 q_depth; u16 cq_vector; @@ -128,7 +130,7 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); } -typedef void (*nvme_completion_fn)(struct nvme_dev *, void *, +typedef void (*nvme_completion_fn)(struct nvme_queue *, void *, struct nvme_completion *); struct nvme_cmd_info { @@ -200,7 +202,7 @@ static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, #define CMD_CTX_FLUSH (0x318 + CMD_CTX_BASE) #define CMD_CTX_ABORT (0x31C + CMD_CTX_BASE) -static void special_completion(struct nvme_dev *dev, void *ctx, +static void special_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { if (ctx == CMD_CTX_CANCELLED) @@ -208,26 +210,26 @@ static void special_completion(struct nvme_dev *dev, void *ctx, if (ctx == CMD_CTX_FLUSH) return; if (ctx == CMD_CTX_ABORT) { - ++dev->abort_limit; + ++nvmeq->dev->abort_limit; return; } if (ctx == CMD_CTX_COMPLETED) { - dev_warn(&dev->pci_dev->dev, + dev_warn(nvmeq->q_dmadev, "completed id %d twice on queue %d\n", cqe->command_id, le16_to_cpup(&cqe->sq_id)); return; } if (ctx == CMD_CTX_INVALID) { - dev_warn(&dev->pci_dev->dev, + dev_warn(nvmeq->q_dmadev, "invalid id %d completed on queue %d\n", cqe->command_id, le16_to_cpup(&cqe->sq_id)); return; } - dev_warn(&dev->pci_dev->dev, "Unknown special completion %p\n", ctx); + dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx); } -static void async_completion(struct nvme_dev *dev, void *ctx, +static void async_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { struct async_cmd_info *cmdinfo = ctx; @@ -357,6 +359,7 @@ nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp) iod->npages = -1; iod->length = nbytes; iod->nents = 0; + iod->first_dma = 0ULL; iod->start_time = jiffies; } @@ -405,19 +408,31 @@ static void nvme_end_io_acct(struct bio *bio, unsigned long start_time) part_stat_unlock(); } -static void bio_completion(struct nvme_dev *dev, void *ctx, +static void bio_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { struct nvme_iod *iod = ctx; struct bio *bio = iod->private; u16 status = le16_to_cpup(&cqe->status) >> 1; + if (unlikely(status)) { + if (!(status & NVME_SC_DNR || + bio->bi_rw & REQ_FAILFAST_MASK) && + (jiffies - iod->start_time) < IOD_TIMEOUT) { + if (!waitqueue_active(&nvmeq->sq_full)) + add_wait_queue(&nvmeq->sq_full, + &nvmeq->sq_cong_wait); + list_add_tail(&iod->node, &nvmeq->iod_bio); + wake_up(&nvmeq->sq_full); + return; + } + } if (iod->nents) { - dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, + dma_unmap_sg(nvmeq->q_dmadev, iod->sg, 
iod->nents, bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); nvme_end_io_acct(bio, iod->start_time); } - nvme_free_iod(dev, iod); + nvme_free_iod(nvmeq->dev, iod); if (status) bio_endio(bio, -EIO); else @@ -425,8 +440,8 @@ static void bio_completion(struct nvme_dev *dev, void *ctx, } /* length is in bytes. gfp flags indicates whether we may sleep. */ -int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd, - struct nvme_iod *iod, int total_len, gfp_t gfp) +int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len, + gfp_t gfp) { struct dma_pool *pool; int length = total_len; @@ -439,7 +454,6 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd, dma_addr_t prp_dma; int nprps, i; - cmd->prp1 = cpu_to_le64(dma_addr); length -= (PAGE_SIZE - offset); if (length <= 0) return total_len; @@ -454,7 +468,7 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd, } if (length <= PAGE_SIZE) { - cmd->prp2 = cpu_to_le64(dma_addr); + iod->first_dma = dma_addr; return total_len; } @@ -469,13 +483,12 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd, prp_list = dma_pool_alloc(pool, gfp, &prp_dma); if (!prp_list) { - cmd->prp2 = cpu_to_le64(dma_addr); + iod->first_dma = dma_addr; iod->npages = -1; return (total_len - length) + PAGE_SIZE; } list[0] = prp_list; iod->first_dma = prp_dma; - cmd->prp2 = cpu_to_le64(prp_dma); i = 0; for (;;) { if (i == PAGE_SIZE / 8) { @@ -514,10 +527,11 @@ static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq, bio_chain(split, bio); - if (bio_list_empty(&nvmeq->sq_cong)) + if (!waitqueue_active(&nvmeq->sq_full)) add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); bio_list_add(&nvmeq->sq_cong, split); bio_list_add(&nvmeq->sq_cong, bio); + wake_up(&nvmeq->sq_full); return 0; } @@ -570,25 +584,13 @@ static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod, return length; } -/* - * We reuse the small pool to allocate the 16-byte range here as it is not - * worth having a special pool for these or additional cases to handle freeing - * the iod. - */ static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, struct bio *bio, struct nvme_iod *iod, int cmdid) { - struct nvme_dsm_range *range; + struct nvme_dsm_range *range = + (struct nvme_dsm_range *)iod_list(iod)[0]; struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; - range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC, - &iod->first_dma); - if (!range) - return -ENOMEM; - - iod_list(iod)[0] = (__le64 *)range; - iod->npages = 0; - range->cattr = cpu_to_le32(0); range->nlb = cpu_to_le32(bio->bi_iter.bi_size >> ns->lba_shift); range->slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector)); @@ -635,44 +637,22 @@ int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns) return nvme_submit_flush(nvmeq, ns, cmdid); } -/* - * Called with local interrupts disabled and the q_lock held. May not sleep. 
- */ -static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, - struct bio *bio) +static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod) { + struct bio *bio = iod->private; + struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data; struct nvme_command *cmnd; - struct nvme_iod *iod; - enum dma_data_direction dma_dir; - int cmdid, length, result; + int cmdid; u16 control; u32 dsmgmt; - int psegs = bio_phys_segments(ns->queue, bio); - - if ((bio->bi_rw & REQ_FLUSH) && psegs) { - result = nvme_submit_flush_data(nvmeq, ns); - if (result) - return result; - } - - result = -ENOMEM; - iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, GFP_ATOMIC); - if (!iod) - goto nomem; - iod->private = bio; - result = -EBUSY; cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT); if (unlikely(cmdid < 0)) - goto free_iod; + return cmdid; - if (bio->bi_rw & REQ_DISCARD) { - result = nvme_submit_discard(nvmeq, ns, bio, iod, cmdid); - if (result) - goto free_cmdid; - return result; - } - if ((bio->bi_rw & REQ_FLUSH) && !psegs) + if (bio->bi_rw & REQ_DISCARD) + return nvme_submit_discard(nvmeq, ns, bio, iod, cmdid); + if ((bio->bi_rw & REQ_FLUSH) && !iod->nents) return nvme_submit_flush(nvmeq, ns, cmdid); control = 0; @@ -686,42 +666,85 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; - memset(cmnd, 0, sizeof(*cmnd)); - if (bio_data_dir(bio)) { - cmnd->rw.opcode = nvme_cmd_write; - dma_dir = DMA_TO_DEVICE; - } else { - cmnd->rw.opcode = nvme_cmd_read; - dma_dir = DMA_FROM_DEVICE; - } - - result = nvme_map_bio(nvmeq, iod, bio, dma_dir, psegs); - if (result <= 0) - goto free_cmdid; - length = result; + cmnd->rw.opcode = bio_data_dir(bio) ? nvme_cmd_write : nvme_cmd_read; cmnd->rw.command_id = cmdid; cmnd->rw.nsid = cpu_to_le32(ns->ns_id); - length = nvme_setup_prps(nvmeq->dev, &cmnd->common, iod, length, - GFP_ATOMIC); + cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); + cmnd->rw.prp2 = cpu_to_le64(iod->first_dma); cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector)); - cmnd->rw.length = cpu_to_le16((length >> ns->lba_shift) - 1); + cmnd->rw.length = + cpu_to_le16((bio->bi_iter.bi_size >> ns->lba_shift) - 1); cmnd->rw.control = cpu_to_le16(control); cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); - nvme_start_io_acct(bio); if (++nvmeq->sq_tail == nvmeq->q_depth) nvmeq->sq_tail = 0; writel(nvmeq->sq_tail, nvmeq->q_db); return 0; +} + +/* + * Called with local interrupts disabled and the q_lock held. May not sleep. + */ +static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, + struct bio *bio) +{ + struct nvme_iod *iod; + int psegs = bio_phys_segments(ns->queue, bio); + int result; + + if ((bio->bi_rw & REQ_FLUSH) && psegs) { + result = nvme_submit_flush_data(nvmeq, ns); + if (result) + return result; + } + + iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, GFP_ATOMIC); + if (!iod) + return -ENOMEM; + + iod->private = bio; + if (bio->bi_rw & REQ_DISCARD) { + void *range; + /* + * We reuse the small pool to allocate the 16-byte range here + * as it is not worth having a special pool for these or + * additional cases to handle freeing the iod. 
+ */ + range = dma_pool_alloc(nvmeq->dev->prp_small_pool, + GFP_ATOMIC, + &iod->first_dma); + if (!range) { + result = -ENOMEM; + goto free_iod; + } + iod_list(iod)[0] = (__le64 *)range; + iod->npages = 0; + } else if (psegs) { + result = nvme_map_bio(nvmeq, iod, bio, + bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, + psegs); + if (result <= 0) + goto free_iod; + if (nvme_setup_prps(nvmeq->dev, iod, result, GFP_ATOMIC) != + result) { + result = -ENOMEM; + goto free_iod; + } + nvme_start_io_acct(bio); + } + if (unlikely(nvme_submit_iod(nvmeq, iod))) { + if (!waitqueue_active(&nvmeq->sq_full)) + add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); + list_add_tail(&iod->node, &nvmeq->iod_bio); + } + return 0; - free_cmdid: - free_cmdid(nvmeq, cmdid, NULL); free_iod: nvme_free_iod(nvmeq->dev, iod); - nomem: return result; } @@ -745,7 +768,7 @@ static int nvme_process_cq(struct nvme_queue *nvmeq) } ctx = free_cmdid(nvmeq, cqe.command_id, &fn); - fn(nvmeq->dev, ctx, &cqe); + fn(nvmeq, ctx, &cqe); } /* If the controller ignores the cq head doorbell and continuously @@ -781,7 +804,7 @@ static void nvme_make_request(struct request_queue *q, struct bio *bio) if (!nvmeq->q_suspended && bio_list_empty(&nvmeq->sq_cong)) result = nvme_submit_bio_queue(nvmeq, ns, bio); if (unlikely(result)) { - if (bio_list_empty(&nvmeq->sq_cong)) + if (!waitqueue_active(&nvmeq->sq_full)) add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); bio_list_add(&nvmeq->sq_cong, bio); } @@ -825,7 +848,7 @@ struct sync_cmd_info { int status; }; -static void sync_completion(struct nvme_dev *dev, void *ctx, +static void sync_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { struct sync_cmd_info *cmdinfo = ctx; @@ -1112,7 +1135,7 @@ static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout) dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", cmdid, nvmeq->qid); ctx = cancel_cmdid(nvmeq, cmdid, &fn); - fn(nvmeq->dev, ctx, &cqe); + fn(nvmeq, ctx, &cqe); } } @@ -1125,6 +1148,17 @@ static void nvme_free_queue(struct rcu_head *r) struct bio *bio = bio_list_pop(&nvmeq->sq_cong); bio_endio(bio, -EIO); } + while (!list_empty(&nvmeq->iod_bio)) { + static struct nvme_completion cqe = { + .status = cpu_to_le16( + (NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1), + }; + struct nvme_iod *iod = list_first_entry(&nvmeq->iod_bio, + struct nvme_iod, + node); + list_del(&iod->node); + bio_completion(nvmeq, iod, &cqe); + } spin_unlock_irq(&nvmeq->q_lock); dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), @@ -1232,6 +1266,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, init_waitqueue_head(&nvmeq->sq_full); init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread); bio_list_init(&nvmeq->sq_cong); + INIT_LIST_HEAD(&nvmeq->iod_bio); nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; nvmeq->q_depth = depth; nvmeq->cq_vector = vector; @@ -1565,7 +1600,9 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) c.rw.metadata = cpu_to_le64(meta_dma_addr); } - length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL); + length = nvme_setup_prps(dev, iod, length, GFP_KERNEL); + c.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); + c.rw.prp2 = cpu_to_le64(iod->first_dma); if (length != (io.nblocks + 1) << ns->lba_shift) status = -ENOMEM; @@ -1635,8 +1672,9 @@ static int nvme_user_admin_cmd(struct nvme_dev *dev, length); if (IS_ERR(iod)) return PTR_ERR(iod); - length = nvme_setup_prps(dev, &c.common, iod, length, - GFP_KERNEL); + length = 
nvme_setup_prps(dev, iod, length, GFP_KERNEL); + c.common.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); + c.common.prp2 = cpu_to_le64(iod->first_dma); } timeout = cmd.timeout_ms ? msecs_to_jiffies(cmd.timeout_ms) : @@ -1733,17 +1771,33 @@ static const struct block_device_operations nvme_fops = { .getgeo = nvme_getgeo, }; +static void nvme_resubmit_iods(struct nvme_queue *nvmeq) +{ + struct nvme_iod *iod, *next; + + list_for_each_entry_safe(iod, next, &nvmeq->iod_bio, node) { + if (unlikely(nvme_submit_iod(nvmeq, iod))) + break; + list_del(&iod->node); + if (bio_list_empty(&nvmeq->sq_cong) && + list_empty(&nvmeq->iod_bio)) + remove_wait_queue(&nvmeq->sq_full, + &nvmeq->sq_cong_wait); + } +} + static void nvme_resubmit_bios(struct nvme_queue *nvmeq) { while (bio_list_peek(&nvmeq->sq_cong)) { struct bio *bio = bio_list_pop(&nvmeq->sq_cong); struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data; - if (bio_list_empty(&nvmeq->sq_cong)) + if (bio_list_empty(&nvmeq->sq_cong) && + list_empty(&nvmeq->iod_bio)) remove_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); if (nvme_submit_bio_queue(nvmeq, ns, bio)) { - if (bio_list_empty(&nvmeq->sq_cong)) + if (!waitqueue_active(&nvmeq->sq_full)) add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); bio_list_add_head(&nvmeq->sq_cong, bio); @@ -1785,6 +1839,7 @@ static int nvme_kthread(void *data) nvme_process_cq(nvmeq); nvme_cancel_ios(nvmeq, true); nvme_resubmit_bios(nvmeq); + nvme_resubmit_iods(nvmeq); unlock: spin_unlock_irq(&nvmeq->q_lock); } diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index 111c920c1574..2c3f5be06da1 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -1562,13 +1562,14 @@ static int nvme_trans_send_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, res = PTR_ERR(iod); goto out; } - length = nvme_setup_prps(dev, &c.common, iod, tot_len, - GFP_KERNEL); + length = nvme_setup_prps(dev, iod, tot_len, GFP_KERNEL); if (length != tot_len) { res = -ENOMEM; goto out_unmap; } + c.dlfw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); + c.dlfw.prp2 = cpu_to_le64(iod->first_dma); c.dlfw.numd = cpu_to_le32((tot_len/BYTES_TO_DWORDS) - 1); c.dlfw.offset = cpu_to_le32(offset/BYTES_TO_DWORDS); } else if (opcode == nvme_admin_activate_fw) { @@ -2092,8 +2093,7 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, res = PTR_ERR(iod); goto out; } - retcode = nvme_setup_prps(dev, &c.common, iod, unit_len, - GFP_KERNEL); + retcode = nvme_setup_prps(dev, iod, unit_len, GFP_KERNEL); if (retcode != unit_len) { nvme_unmap_user_pages(dev, (is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, @@ -2102,6 +2102,8 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, res = -ENOMEM; goto out; } + c.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); + c.rw.prp2 = cpu_to_le64(iod->first_dma); nvme_offset += unit_num_blocks; -- cgit v1.2.3
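A closing note on the PRP fields every submitter now fills by hand: per the NVMe PRP convention (and matching what nvme_setup_prps() computes above), PRP1 is the bus address of the first data page, while PRP2 is unused for a one-page transfer, the second page for a two-page transfer, or the address of a PRP list for anything larger; nvme_setup_prps() leaves that second value in iod->first_dma in every case. The resulting caller-side idiom, as used throughout this patch (variable names illustrative):

    length = nvme_setup_prps(dev, iod, total_len, GFP_KERNEL);
    cmd.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));  /* first data page */
    cmd.rw.prp2 = cpu_to_le64(iod->first_dma);           /* 2nd page or PRP list */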