From 5261b85e586afe6ebe54e16e0a8acc32fc6d4902 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 13 Mar 2014 11:23:39 +1030 Subject: virtio_blk: don't crash, report error if virtqueue is broken. A bad implementation of virtio might cause us to mark the virtqueue broken: we'll dev_err() in that case, and the device is useless, but let's not BUG_ON(). ENOMEM or ENOSPC implies the ring is full, and we should try again later (-ENOMEM is documented to happen, but doesn't, as we fall through to ENOSPC). EIO means it's broken. Signed-off-by: Rusty Russell --- drivers/block/virtio_blk.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index b1cb3f4c4db4..a2db9ed288f2 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -158,6 +158,7 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req) unsigned long flags; unsigned int num; const bool last = (req->cmd_flags & REQ_END) != 0; + int err; BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems); @@ -198,11 +199,16 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req) } spin_lock_irqsave(&vblk->vq_lock, flags); - if (__virtblk_add_req(vblk->vq, vbr, vbr->sg, num) < 0) { + err = __virtblk_add_req(vblk->vq, vbr, vbr->sg, num); + if (err) { virtqueue_kick(vblk->vq); spin_unlock_irqrestore(&vblk->vq_lock, flags); blk_mq_stop_hw_queue(hctx); - return BLK_MQ_RQ_QUEUE_BUSY; + /* Out of mem doesn't actually happen, since we fall back + * to direct descriptors */ + if (err == -ENOMEM || err == -ENOSPC) + return BLK_MQ_RQ_QUEUE_BUSY; + return BLK_MQ_RQ_QUEUE_ERROR; } if (last) -- cgit v1.2.3 From fc4324b4597c4eb8907207e82f9a6acec84dd335 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 19 Mar 2014 17:08:24 +1030 Subject: virtio-blk: base queue-depth on virtqueue ringsize or module param Venkatesh spake thus: virtio-blk set the default queue depth to 64 requests, which was insufficient for high-IOPS devices. Instead set the blk-queue depth to the device's virtqueue depth divided by two (each I/O requires at least two VQ entries). But behold, Ted added a module parameter: Also allow the queue depth to be something which can be set at module load time or via a kernel boot-time parameter, for testing/benchmarking purposes. And I rewrote it substantially, mainly to take VIRTIO_RING_F_INDIRECT_DESC into account. As QEMU sets the vq size for PCI to 128, Venkatesh's patch wouldn't have made a change. This version does (since QEMU also offers VIRTIO_RING_F_INDIRECT_DESC). Inspired-by: "Theodore Ts'o" Based-on-the-true-story-of: Venkatesh Srinivas Cc: "Michael S.
Tsirkin" Cc: virtio-dev@lists.oasis-open.org Cc: virtualization@lists.linux-foundation.org Cc: Frank Swiderski Signed-off-by: Rusty Russell --- drivers/block/virtio_blk.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index a2db9ed288f2..196222271a50 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -491,10 +491,11 @@ static struct blk_mq_ops virtio_mq_ops = { static struct blk_mq_reg virtio_mq_reg = { .ops = &virtio_mq_ops, .nr_hw_queues = 1, - .queue_depth = 64, + .queue_depth = 0, /* Set in virtblk_probe */ .numa_node = NUMA_NO_NODE, .flags = BLK_MQ_F_SHOULD_MERGE, }; +module_param_named(queue_depth, virtio_mq_reg.queue_depth, uint, 0444); static void virtblk_init_vbr(void *data, struct blk_mq_hw_ctx *hctx, struct request *rq, unsigned int nr) @@ -558,6 +559,13 @@ static int virtblk_probe(struct virtio_device *vdev) goto out_free_vq; } + /* Default queue sizing is to fill the ring. */ + if (!virtio_mq_reg.queue_depth) { + virtio_mq_reg.queue_depth = vblk->vq->num_free; + /* ... but without indirect descs, we use 2 descs per req */ + if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC)) + virtio_mq_reg.queue_depth /= 2; + } virtio_mq_reg.cmd_size = sizeof(struct virtblk_req) + sizeof(struct scatterlist) * sg_elems; -- cgit v1.2.3 From 671a6018db1d359254c8201704cf967efb42fc25 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 13 Feb 2014 11:19:14 +0900 Subject: NVMe: Add CONFIG_PM_SLEEP to suspend/resume functions Add CONFIG_PM_SLEEP to suspend/resume functions to fix the following build warning when CONFIG_PM_SLEEP is not selected. This is because sleep PM callbacks defined by SIMPLE_DEV_PM_OPS are only used when the CONFIG_PM_SLEEP is enabled. drivers/block/nvme-core.c:2541:12: warning: 'nvme_suspend' defined but not used [-Wunused-function] drivers/block/nvme-core.c:2550:12: warning: 'nvme_resume' defined but not used [-Wunused-function] Signed-off-by: Jingoo Han Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 51824d1f23ea..87dd4c79e65f 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2538,6 +2538,7 @@ static void nvme_remove(struct pci_dev *pdev) #define nvme_slot_reset NULL #define nvme_error_resume NULL +#ifdef CONFIG_PM_SLEEP static int nvme_suspend(struct device *dev) { struct pci_dev *pdev = to_pci_dev(dev); @@ -2558,6 +2559,7 @@ static int nvme_resume(struct device *dev) } return 0; } +#endif static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume); -- cgit v1.2.3 From fb35e914b3f88cda9ee6f9d776910c35269c4ecf Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 3 Mar 2014 11:09:47 -0700 Subject: NVMe: Initialize device reference count earlier If an NVMe device becomes ready but fails to create IO queues, the driver creates a character device handle so the device can be managed. The device reference count needs to be initialized before creating the character device. 
Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 87dd4c79e65f..f565212a9e32 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2464,6 +2464,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto release; + kref_init(&dev->kref); result = nvme_dev_start(dev); if (result) { if (result == -EBUSY) @@ -2471,7 +2472,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto release_pools; } - kref_init(&dev->kref); result = nvme_dev_add(dev); if (result) goto shutdown; -- cgit v1.2.3 From 5a92e700af2e5e0e6404988d6a7f2ed3dad3f46f Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 21 Feb 2014 14:13:44 -0700 Subject: NVMe: RCU protected access to io queues This adds rcu protected access to nvme_queue to fix a race between a surprise removal freeing the queue and a thread with an open reference on an NVMe block device using that queue. The queues do not need to be rcu protected during the initialization or shutdown parts, so I've added a helper function for raw dereferencing to get around the sparse errors. There is still a hole in the IOCTL path for the same problem, which is fixed in a subsequent patch. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 91 +++++++++++++++++++++++------------------------ 1 file changed, 45 insertions(+), 46 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index f565212a9e32..b66ab1db4629 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -74,6 +74,7 @@ struct async_cmd_info { * commands and one for I/O commands).
*/ struct nvme_queue { + struct rcu_head r_head; struct device *q_dmadev; struct nvme_dev *dev; char irqname[24]; /* nvme4294967295-65535\0 */ @@ -262,14 +263,21 @@ static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid, return ctx; } -struct nvme_queue *get_nvmeq(struct nvme_dev *dev) +static struct nvme_queue *raw_nvmeq(struct nvme_dev *dev, int qid) { - return dev->queues[get_cpu() + 1]; + return rcu_dereference_raw(dev->queues[qid]); } -void put_nvmeq(struct nvme_queue *nvmeq) +struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU) +{ + rcu_read_lock(); + return rcu_dereference(dev->queues[get_cpu() + 1]); +} + +void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU) { put_cpu(); + rcu_read_unlock(); } /** @@ -852,13 +860,14 @@ static int nvme_submit_async_cmd(struct nvme_queue *nvmeq, int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, u32 *result) { - return nvme_submit_sync_cmd(dev->queues[0], cmd, result, ADMIN_TIMEOUT); + return nvme_submit_sync_cmd(raw_nvmeq(dev, 0), cmd, result, + ADMIN_TIMEOUT); } static int nvme_submit_admin_cmd_async(struct nvme_dev *dev, struct nvme_command *cmd, struct async_cmd_info *cmdinfo) { - return nvme_submit_async_cmd(dev->queues[0], cmd, cmdinfo, + return nvme_submit_async_cmd(raw_nvmeq(dev, 0), cmd, cmdinfo, ADMIN_TIMEOUT); } @@ -985,6 +994,7 @@ static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq) struct nvme_command cmd; struct nvme_dev *dev = nvmeq->dev; struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); + struct nvme_queue *adminq; if (!nvmeq->qid || info[cmdid].aborted) { if (work_busy(&dev->reset_work)) @@ -1001,7 +1011,8 @@ static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq) if (!dev->abort_limit) return; - a_cmdid = alloc_cmdid(dev->queues[0], CMD_CTX_ABORT, special_completion, + adminq = rcu_dereference(dev->queues[0]); + a_cmdid = alloc_cmdid(adminq, CMD_CTX_ABORT, special_completion, ADMIN_TIMEOUT); if (a_cmdid < 0) return; @@ -1018,7 +1029,7 @@ static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq) dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", cmdid, nvmeq->qid); - nvme_submit_cmd(dev->queues[0], &cmd); + nvme_submit_cmd(adminq, &cmd); } /** @@ -1055,8 +1066,10 @@ static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout) } } -static void nvme_free_queue(struct nvme_queue *nvmeq) +static void nvme_free_queue(struct rcu_head *r) { + struct nvme_queue *nvmeq = container_of(r, struct nvme_queue, r_head); + spin_lock_irq(&nvmeq->q_lock); while (bio_list_peek(&nvmeq->sq_cong)) { struct bio *bio = bio_list_pop(&nvmeq->sq_cong); @@ -1075,10 +1088,13 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest) { int i; + for (i = num_possible_cpus(); i > dev->queue_count - 1; i--) + rcu_assign_pointer(dev->queues[i], NULL); for (i = dev->queue_count - 1; i >= lowest; i--) { - nvme_free_queue(dev->queues[i]); + struct nvme_queue *nvmeq = raw_nvmeq(dev, i); + rcu_assign_pointer(dev->queues[i], NULL); + call_rcu(&nvmeq->r_head, nvme_free_queue); dev->queue_count--; - dev->queues[i] = NULL; } } @@ -1116,7 +1132,7 @@ static void nvme_clear_queue(struct nvme_queue *nvmeq) static void nvme_disable_queue(struct nvme_dev *dev, int qid) { - struct nvme_queue *nvmeq = dev->queues[qid]; + struct nvme_queue *nvmeq = raw_nvmeq(dev, qid); if (!nvmeq) return; @@ -1168,6 +1184,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, nvmeq->qid = qid; nvmeq->q_suspended = 1; dev->queue_count++; + rcu_assign_pointer(dev->queues[qid], nvmeq); 
return nvmeq; @@ -1311,12 +1328,11 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) if (result < 0) return result; - nvmeq = dev->queues[0]; + nvmeq = raw_nvmeq(dev, 0); if (!nvmeq) { nvmeq = nvme_alloc_queue(dev, 0, 64, 0); if (!nvmeq) return -ENOMEM; - dev->queues[0] = nvmeq; } aqa = nvmeq->q_depth - 1; @@ -1581,8 +1597,8 @@ static int nvme_user_admin_cmd(struct nvme_dev *dev, if (length != cmd.data_len) status = -ENOMEM; else - status = nvme_submit_sync_cmd(dev->queues[0], &c, &cmd.result, - timeout); + status = nvme_submit_sync_cmd(raw_nvmeq(dev, 0), &c, + &cmd.result, timeout); if (cmd.data_len) { nvme_unmap_user_pages(dev, cmd.opcode & 1, iod); @@ -1701,8 +1717,10 @@ static int nvme_kthread(void *data) queue_work(nvme_workq, &dev->reset_work); continue; } + rcu_read_lock(); for (i = 0; i < dev->queue_count; i++) { - struct nvme_queue *nvmeq = dev->queues[i]; + struct nvme_queue *nvmeq = + rcu_dereference(dev->queues[i]); if (!nvmeq) continue; spin_lock_irq(&nvmeq->q_lock); @@ -1714,6 +1732,7 @@ static int nvme_kthread(void *data) unlock: spin_unlock_irq(&nvmeq->q_lock); } + rcu_read_unlock(); } spin_unlock(&dev_list_lock); schedule_timeout(round_jiffies_relative(HZ)); @@ -1808,7 +1827,7 @@ static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) static int nvme_setup_io_queues(struct nvme_dev *dev) { - struct nvme_queue *adminq = dev->queues[0]; + struct nvme_queue *adminq = raw_nvmeq(dev, 0); struct pci_dev *pdev = dev->pci_dev; int result, cpu, i, vecs, nr_io_queues, size, q_depth; @@ -1831,7 +1850,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) size = db_bar_size(dev, nr_io_queues); } while (1); dev->dbs = ((void __iomem *)dev->bar) + 4096; - dev->queues[0]->q_db = dev->dbs; + adminq->q_db = dev->dbs; } /* Deregister the admin queue's interrupt */ @@ -1880,19 +1899,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) } /* Free previously allocated queues that are no longer usable */ - spin_lock(&dev_list_lock); - for (i = dev->queue_count - 1; i > nr_io_queues; i--) { - struct nvme_queue *nvmeq = dev->queues[i]; - - spin_lock_irq(&nvmeq->q_lock); - nvme_cancel_ios(nvmeq, false); - spin_unlock_irq(&nvmeq->q_lock); - - nvme_free_queue(nvmeq); - dev->queue_count--; - dev->queues[i] = NULL; - } - spin_unlock(&dev_list_lock); + nvme_free_queues(dev, nr_io_queues); cpu = cpumask_first(cpu_online_mask); for (i = 0; i < nr_io_queues; i++) { @@ -1903,8 +1910,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) q_depth = min_t(int, NVME_CAP_MQES(readq(&dev->bar->cap)) + 1, NVME_Q_DEPTH); for (i = dev->queue_count - 1; i < nr_io_queues; i++) { - dev->queues[i + 1] = nvme_alloc_queue(dev, i + 1, q_depth, i); - if (!dev->queues[i + 1]) { + if (!nvme_alloc_queue(dev, i + 1, q_depth, i)) { result = -ENOMEM; goto free_queues; } @@ -1912,11 +1918,11 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) for (; i < num_possible_cpus(); i++) { int target = i % rounddown_pow_of_two(dev->queue_count - 1); - dev->queues[i + 1] = dev->queues[target + 1]; + rcu_assign_pointer(dev->queues[i + 1], dev->queues[target + 1]); } for (i = 1; i < dev->queue_count; i++) { - result = nvme_create_queue(dev->queues[i], i); + result = nvme_create_queue(raw_nvmeq(dev, i), i); if (result) { for (--i; i > 0; i--) nvme_disable_queue(dev, i); @@ -2180,7 +2186,7 @@ static void nvme_disable_io_queues(struct nvme_dev *dev) atomic_set(&dq.refcount, 0); dq.worker = &worker; for (i = dev->queue_count - 1; i > 0; i--) { - struct nvme_queue *nvmeq = dev->queues[i]; + struct 
nvme_queue *nvmeq = raw_nvmeq(dev, i); if (nvme_suspend_queue(nvmeq)) continue; @@ -2205,7 +2211,7 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) if (!dev->bar || (dev->bar && readl(&dev->bar->csts) == -1)) { for (i = dev->queue_count - 1; i >= 0; i--) { - struct nvme_queue *nvmeq = dev->queues[i]; + struct nvme_queue *nvmeq = raw_nvmeq(dev, i); nvme_suspend_queue(nvmeq); nvme_clear_queue(nvmeq); } @@ -2383,18 +2389,10 @@ static int nvme_remove_dead_ctrl(void *arg) static void nvme_remove_disks(struct work_struct *ws) { - int i; struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work); nvme_dev_remove(dev); - spin_lock(&dev_list_lock); - for (i = dev->queue_count - 1; i > 0; i--) { - BUG_ON(!dev->queues[i] || !dev->queues[i]->q_suspended); - nvme_free_queue(dev->queues[i]); - dev->queue_count--; - dev->queues[i] = NULL; - } - spin_unlock(&dev_list_lock); + nvme_free_queues(dev, 1); } static int nvme_dev_resume(struct nvme_dev *dev) @@ -2526,6 +2524,7 @@ static void nvme_remove(struct pci_dev *pdev) nvme_dev_remove(dev); nvme_dev_shutdown(dev); nvme_free_queues(dev, 0); + rcu_barrier(); nvme_release_instance(dev); nvme_release_prp_pools(dev); kref_put(&dev->kref, nvme_free_dev); -- cgit v1.2.3 From 4f5099af4f3d5f999d8ab7784472d93e810e3912 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 3 Mar 2014 16:39:13 -0700 Subject: NVMe: IOCTL path RCU protect queue access This adds rcu protected access to a queue in the nvme IOCTL path to fix potential races between a surprise removal and queue usage in nvme_submit_sync_cmd. The fix holds the rcu_read_lock() here to prevent the nvme_queue from freeing while this path is executing, so it can't sleep, and so this path will no longer wait for an available command id should they all be in use at the time a passthrough IOCTL request is received.
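Condensed, the access pattern this patch introduces looks like the sketch below. It mirrors the lock_nvmeq()/unlock_nvmeq() helpers added in the diff that follows; submit_sketch() is a hypothetical caller shown only to illustrate the shape of the path, not a function in the driver.

static struct nvme_queue *lock_nvmeq(struct nvme_dev *dev, int q_idx)
	__acquires(RCU)
{
	rcu_read_lock();			/* pin the queue pointer */
	return rcu_dereference(dev->queues[q_idx]);
}

static void unlock_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
{
	rcu_read_unlock();
}

static int submit_sketch(struct nvme_dev *dev, int q_idx)
{
	struct nvme_queue *nvmeq = lock_nvmeq(dev, q_idx);

	if (!nvmeq) {			/* queue vanished (surprise removal) */
		unlock_nvmeq(nvmeq);
		return -ENODEV;
	}
	/* queue the command here; must not sleep under rcu_read_lock() */
	unlock_nvmeq(nvmeq);
	return 0;
}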
Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 82 +++++++++++++++++++++++++++++++---------------- drivers/block/nvme-scsi.c | 31 +++--------------- 2 files changed, 59 insertions(+), 54 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index b66ab1db4629..04664cadadfa 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -268,18 +268,30 @@ static struct nvme_queue *raw_nvmeq(struct nvme_dev *dev, int qid) return rcu_dereference_raw(dev->queues[qid]); } -struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU) +static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU) { rcu_read_lock(); return rcu_dereference(dev->queues[get_cpu() + 1]); } -void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU) +static void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU) { put_cpu(); rcu_read_unlock(); } +static struct nvme_queue *lock_nvmeq(struct nvme_dev *dev, int q_idx) + __acquires(RCU) +{ + rcu_read_lock(); + return rcu_dereference(dev->queues[q_idx]); +} + +static void unlock_nvmeq(struct nvme_queue *nvmeq) __releases(RCU) +{ + rcu_read_unlock(); +} + /** * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell * @nvmeq: The queue to use @@ -292,6 +304,10 @@ static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) unsigned long flags; u16 tail; spin_lock_irqsave(&nvmeq->q_lock, flags); + if (nvmeq->q_suspended) { + spin_unlock_irqrestore(&nvmeq->q_lock, flags); + return -EBUSY; + } tail = nvmeq->sq_tail; memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd)); if (++tail == nvmeq->q_depth) @@ -812,27 +828,46 @@ static void sync_completion(struct nvme_dev *dev, void *ctx, * Returns 0 on success. 
If the result is negative, it's a Linux error code; * if the result is positive, it's an NVM Express status code */ -int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd, +static int nvme_submit_sync_cmd(struct nvme_dev *dev, int q_idx, + struct nvme_command *cmd, u32 *result, unsigned timeout) { - int cmdid; + int cmdid, ret; struct sync_cmd_info cmdinfo; + struct nvme_queue *nvmeq; + + nvmeq = lock_nvmeq(dev, q_idx); + if (!nvmeq) { + unlock_nvmeq(nvmeq); + return -ENODEV; + } cmdinfo.task = current; cmdinfo.status = -EINTR; - cmdid = alloc_cmdid_killable(nvmeq, &cmdinfo, sync_completion, - timeout); - if (cmdid < 0) + cmdid = alloc_cmdid(nvmeq, &cmdinfo, sync_completion, timeout); + if (cmdid < 0) { + unlock_nvmeq(nvmeq); return cmdid; + } cmd->common.command_id = cmdid; set_current_state(TASK_KILLABLE); - nvme_submit_cmd(nvmeq, cmd); + ret = nvme_submit_cmd(nvmeq, cmd); + if (ret) { + free_cmdid(nvmeq, cmdid, NULL); + unlock_nvmeq(nvmeq); + set_current_state(TASK_RUNNING); + return ret; + } + unlock_nvmeq(nvmeq); schedule_timeout(timeout); if (cmdinfo.status == -EINTR) { - nvme_abort_command(nvmeq, cmdid); + nvmeq = lock_nvmeq(dev, q_idx); + if (nvmeq) + nvme_abort_command(nvmeq, cmdid); + unlock_nvmeq(nvmeq); return -EINTR; } @@ -853,15 +888,20 @@ static int nvme_submit_async_cmd(struct nvme_queue *nvmeq, return cmdid; cmdinfo->status = -EINTR; cmd->common.command_id = cmdid; - nvme_submit_cmd(nvmeq, cmd); - return 0; + return nvme_submit_cmd(nvmeq, cmd); } int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, u32 *result) { - return nvme_submit_sync_cmd(raw_nvmeq(dev, 0), cmd, result, - ADMIN_TIMEOUT); + return nvme_submit_sync_cmd(dev, 0, cmd, result, ADMIN_TIMEOUT); +} + +int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_command *cmd, + u32 *result) +{ + return nvme_submit_sync_cmd(dev, smp_processor_id() + 1, cmd, result, + NVME_IO_TIMEOUT); } static int nvme_submit_admin_cmd_async(struct nvme_dev *dev, @@ -1434,7 +1474,6 @@ void nvme_unmap_user_pages(struct nvme_dev *dev, int write, static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) { struct nvme_dev *dev = ns->dev; - struct nvme_queue *nvmeq; struct nvme_user_io io; struct nvme_command c; unsigned length, meta_len; @@ -1510,20 +1549,10 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL); - nvmeq = get_nvmeq(dev); - /* - * Since nvme_submit_sync_cmd sleeps, we can't keep preemption - * disabled. We may be preempted at any point, and be rescheduled - * to a different CPU. That will cause cacheline bouncing, but no - * additional races since q_lock already protects against other CPUs. 
- */ - put_nvmeq(nvmeq); if (length != (io.nblocks + 1) << ns->lba_shift) status = -ENOMEM; - else if (!nvmeq || nvmeq->q_suspended) - status = -EBUSY; else - status = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT); + status = nvme_submit_io_cmd(dev, &c, NULL); if (meta_len) { if (status == NVME_SC_SUCCESS && !(io.opcode & 1)) { @@ -1597,8 +1626,7 @@ static int nvme_user_admin_cmd(struct nvme_dev *dev, if (length != cmd.data_len) status = -ENOMEM; else - status = nvme_submit_sync_cmd(raw_nvmeq(dev, 0), &c, - &cmd.result, timeout); + status = nvme_submit_sync_cmd(dev, 0, &c, &cmd.result, timeout); if (cmd.data_len) { nvme_unmap_user_pages(dev, cmd.opcode & 1, iod); diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index 4a0ceb64e269..e157e85bb5d7 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -2033,7 +2033,6 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, int res = SNTI_TRANSLATION_SUCCESS; int nvme_sc; struct nvme_dev *dev = ns->dev; - struct nvme_queue *nvmeq; u32 num_cmds; struct nvme_iod *iod; u64 unit_len; @@ -2106,18 +2105,7 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, nvme_offset += unit_num_blocks; - nvmeq = get_nvmeq(dev); - /* - * Since nvme_submit_sync_cmd sleeps, we can't keep - * preemption disabled. We may be preempted at any - * point, and be rescheduled to a different CPU. That - * will cause cacheline bouncing, but no additional - * races since q_lock already protects against other - * CPUs. - */ - put_nvmeq(nvmeq); - nvme_sc = nvme_submit_sync_cmd(nvmeq, &c, NULL, - NVME_IO_TIMEOUT); + nvme_sc = nvme_submit_io_cmd(dev, &c, NULL); if (nvme_sc != NVME_SC_SUCCESS) { nvme_unmap_user_pages(dev, (is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, @@ -2644,7 +2632,6 @@ static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr, { int res = SNTI_TRANSLATION_SUCCESS; int nvme_sc; - struct nvme_queue *nvmeq; struct nvme_command c; u8 immed, pcmod, pc, no_flush, start; @@ -2671,10 +2658,7 @@ static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr, c.common.opcode = nvme_cmd_flush; c.common.nsid = cpu_to_le32(ns->ns_id); - nvmeq = get_nvmeq(ns->dev); - put_nvmeq(nvmeq); - nvme_sc = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT); - + nvme_sc = nvme_submit_io_cmd(ns->dev, &c, NULL); res = nvme_trans_status_code(hdr, nvme_sc); if (res) goto out; @@ -2697,15 +2681,12 @@ static int nvme_trans_synchronize_cache(struct nvme_ns *ns, int res = SNTI_TRANSLATION_SUCCESS; int nvme_sc; struct nvme_command c; - struct nvme_queue *nvmeq; memset(&c, 0, sizeof(c)); c.common.opcode = nvme_cmd_flush; c.common.nsid = cpu_to_le32(ns->ns_id); - nvmeq = get_nvmeq(ns->dev); - put_nvmeq(nvmeq); - nvme_sc = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT); + nvme_sc = nvme_submit_io_cmd(ns->dev, &c, NULL); res = nvme_trans_status_code(hdr, nvme_sc); if (res) @@ -2872,7 +2853,6 @@ static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr, struct nvme_dev *dev = ns->dev; struct scsi_unmap_parm_list *plist; struct nvme_dsm_range *range; - struct nvme_queue *nvmeq; struct nvme_command c; int i, nvme_sc, res = -ENOMEM; u16 ndesc, list_len; @@ -2914,10 +2894,7 @@ static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr, c.dsm.nr = cpu_to_le32(ndesc - 1); c.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); - nvmeq = get_nvmeq(dev); - put_nvmeq(nvmeq); - - nvme_sc = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT); + nvme_sc = 
nvme_submit_io_cmd(dev, &c, NULL); res = nvme_trans_status_code(hdr, nvme_sc); dma_free_coherent(&dev->pci_dev->dev, ndesc * sizeof(*range), -- cgit v1.2.3 From ddcb776286c091189a7b928188112470ec7e9efc Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 24 Mar 2014 10:03:56 -0400 Subject: NVMe: Fix divide-by-zero in nvme_trans_io_get_num_cmds dev->max_hw_sectors may be zero to indicate the device has no limit on the number of sectors. nvme_trans_do_nvme_io() should use the software limit, since this is guaranteed to be non-zero. Reported-by: Mundu Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-scsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index e157e85bb5d7..111c920c1574 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -2044,7 +2044,7 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, struct nvme_command c; u8 opcode = (is_write ? nvme_cmd_write : nvme_cmd_read); u16 control; - u32 max_blocks = nvme_block_nr(ns, dev->max_hw_sectors); + u32 max_blocks = queue_max_hw_sectors(ns->queue); num_cmds = nvme_trans_io_get_num_cmds(hdr, cdb_info, max_blocks); -- cgit v1.2.3 From 6eb0d698efa9c2a35ec3ca958699717c603f85ee Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 24 Mar 2014 10:11:22 -0400 Subject: NVMe: Replace DEFINE_PCI_DEVICE_TABLE Checkpatch has started warning against using DEFINE_PCI_DEVICE_TABLE, so replace it. Also update the copyright date and bump the module version number to 0.9. Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 04664cadadfa..e9495f0bfad3 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1,6 +1,6 @@ /* * NVM Express device driver - * Copyright (c) 2011, Intel Corporation. + * Copyright (c) 2011-2014, Intel Corporation. 
* * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -2601,7 +2601,7 @@ static const struct pci_error_handlers nvme_err_handler = { /* Move to pci_ids.h later */ #define PCI_CLASS_STORAGE_EXPRESS 0x010802 -static DEFINE_PCI_DEVICE_TABLE(nvme_id_table) = { +static const struct pci_device_id nvme_id_table[] = { { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, { 0, } }; @@ -2662,6 +2662,6 @@ static void __exit nvme_exit(void) MODULE_AUTHOR("Matthew Wilcox "); MODULE_LICENSE("GPL"); -MODULE_VERSION("0.8"); +MODULE_VERSION("0.9"); module_init(nvme_init); module_exit(nvme_exit); -- cgit v1.2.3 From e25115786ee540fc428a14872ebd4f56252aba32 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 5 Mar 2014 20:41:36 -0500 Subject: switch nbd to sockfd_lookup/sockfd_put Signed-off-by: Al Viro --- drivers/block/nbd.c | 48 +++++++++++++++++++----------------------------- 1 file changed, 19 insertions(+), 29 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 55298db36b2d..3a70ea2f7cd6 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -630,37 +630,29 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, } case NBD_CLEAR_SOCK: { - struct file *file; - + struct socket *sock = nbd->sock; nbd->sock = NULL; - file = nbd->file; - nbd->file = NULL; nbd_clear_que(nbd); BUG_ON(!list_empty(&nbd->queue_head)); BUG_ON(!list_empty(&nbd->waiting_queue)); kill_bdev(bdev); - if (file) - fput(file); + if (sock) + sockfd_put(sock); return 0; } case NBD_SET_SOCK: { - struct file *file; - if (nbd->file) + struct socket *sock; + int err; + if (nbd->sock) return -EBUSY; - file = fget(arg); - if (file) { - struct inode *inode = file_inode(file); - if (S_ISSOCK(inode->i_mode)) { - nbd->file = file; - nbd->sock = SOCKET_I(inode); - if (max_part > 0) - bdev->bd_invalidated = 1; - nbd->disconnect = 0; /* we're connected now */ - return 0; - } else { - fput(file); - } + sock = sockfd_lookup(arg, &err); + if (sock) { + nbd->sock = sock; + if (max_part > 0) + bdev->bd_invalidated = 1; + nbd->disconnect = 0; /* we're connected now */ + return 0; } return -EINVAL; } @@ -697,12 +689,12 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, case NBD_DO_IT: { struct task_struct *thread; - struct file *file; + struct socket *sock; int error; if (nbd->pid) return -EBUSY; - if (!nbd->file) + if (!nbd->sock) return -EINVAL; mutex_unlock(&nbd->tx_lock); @@ -731,15 +723,15 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, if (error) return error; sock_shutdown(nbd, 0); - file = nbd->file; - nbd->file = NULL; + sock = nbd->sock; + nbd->sock = NULL; nbd_clear_que(nbd); dev_warn(disk_to_dev(nbd->disk), "queue cleared\n"); kill_bdev(bdev); queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue); set_device_ro(bdev, false); - if (file) - fput(file); + if (sock) + sockfd_put(sock); nbd->flags = 0; nbd->bytesize = 0; bdev->bd_inode->i_size = 0; @@ -875,9 +867,7 @@ static int __init nbd_init(void) for (i = 0; i < nbds_max; i++) { struct gendisk *disk = nbd_dev[i].disk; - nbd_dev[i].file = NULL; nbd_dev[i].magic = NBD_MAGIC; - nbd_dev[i].flags = 0; INIT_LIST_HEAD(&nbd_dev[i].waiting_queue); spin_lock_init(&nbd_dev[i].queue_lock); INIT_LIST_HEAD(&nbd_dev[i].queue_head); -- cgit v1.2.3 From f730c848affc05fb7262574b06e0cd7e1fa96096 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 8 Feb 2014 21:07:38 -0500 Subject: 
drbd: don't open-code kernel_recvmsg() Signed-off-by: Al Viro --- drivers/block/drbd/drbd_receiver.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index d073305ffd5e..1385714eccb7 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -468,24 +468,14 @@ static void drbd_wait_ee_list_empty(struct drbd_conf *mdev, static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flags) { - mm_segment_t oldfs; struct kvec iov = { .iov_base = buf, .iov_len = size, }; struct msghdr msg = { - .msg_iovlen = 1, - .msg_iov = (struct iovec *)&iov, .msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL) }; - int rv; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - rv = sock_recvmsg(sock, &msg, size, msg.msg_flags); - set_fs(oldfs); - - return rv; + return kernel_recvmsg(sock, &msg, &iov, 1, size, msg.msg_flags); } static int drbd_recv(struct drbd_tconn *tconn, void *buf, size_t size) -- cgit v1.2.3 From 62054da65c626dd603190c16805f92cf2cf47d4c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 4 Mar 2014 11:57:17 +0200 Subject: rbd: remove out_partial label in rbd_img_request_fill() Commit 03507db631c94 ("rbd: fix buffer size for writes to images with snapshots") moved the call to rbd_img_obj_request_add() up, making the out_partial label bogus. Remove it. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- drivers/block/rbd.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 34898d53395b..43b9814edf97 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2190,6 +2190,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, rbd_segment_name_free(object_name); if (!obj_request) goto out_unwind; + /* * set obj_request->img_request before creating the * osd_request so that it gets the right snapc @@ -2207,7 +2208,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, clone_size, GFP_ATOMIC); if (!obj_request->bio_list) - goto out_partial; + goto out_unwind; } else { unsigned int page_count; @@ -2222,7 +2223,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, osd_req = rbd_osd_req_create(rbd_dev, write_request, obj_request); if (!osd_req) - goto out_partial; + goto out_unwind; obj_request->osd_req = osd_req; obj_request->callback = rbd_img_obj_callback; @@ -2249,8 +2250,6 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, return 0; -out_partial: - rbd_obj_request_put(obj_request); out_unwind: for_each_obj_request_safe(img_request, obj_request, next_obj_request) rbd_obj_request_put(obj_request); -- cgit v1.2.3 From 42dd037c08c7cd6e3e9af7824b0c1d063f838885 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 4 Mar 2014 11:57:17 +0200 Subject: rbd: fix error paths in rbd_img_request_fill() Doing rbd_obj_request_put() in rbd_img_request_fill() error paths is not only insufficient, but also triggers an rbd_assert() in rbd_obj_request_destroy(): Assertion failure in rbd_obj_request_destroy() at line 1867: rbd_assert(obj_request->img_request == NULL); rbd_img_obj_request_add() adds obj_requests to the img_request, the opposite is rbd_img_obj_request_del(). Use it. 
Fixes: http://tracker.ceph.com/issues/7327 Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- drivers/block/rbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 43b9814edf97..b55b7812cf93 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2252,7 +2252,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, out_unwind: for_each_obj_request_safe(img_request, obj_request, next_obj_request) - rbd_obj_request_put(obj_request); + rbd_img_obj_request_del(img_request, obj_request); return -ENOMEM; } -- cgit v1.2.3 From 7cc69d42e6950404587bef9489a5ed6f9f6bab4e Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 25 Feb 2014 16:22:27 +0200 Subject: libceph: bump CEPH_OSD_MAX_OP to 3 Our longest osd request now contains 3 ops: copyup+hint+write. Also, CEPH_OSD_MAX_OP value in a BUG_ON in rbd_osd_req_callback() was hard-coded to 2. Fix it, and switch to rbd_assert while at it. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil Reviewed-by: Alex Elder --- drivers/block/rbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index b55b7812cf93..4c612c4041b6 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1654,7 +1654,7 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, if (osd_req->r_result < 0) obj_request->result = osd_req->r_result; - BUG_ON(osd_req->r_num_ops > 2); + rbd_assert(osd_req->r_num_ops <= CEPH_OSD_MAX_OP); /* * We support a 64-bit length, but ultimately it has to be -- cgit v1.2.3 From deb236b300cea3e7a114115571194b9872dbdfd1 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 25 Feb 2014 16:22:27 +0200 Subject: rbd: num_ops parameter for rbd_osd_req_create() In preparation for prefixing rbd writes with an allocation hint introduce a num_ops parameter for rbd_osd_req_create(). The rationale is that not every write request is a write op that needs to be prefixed (e.g. watch op), so the num_ops logic needs to be in the callers. 
Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil Reviewed-by: Alex Elder --- drivers/block/rbd.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4c612c4041b6..42ecae1436cc 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1718,6 +1718,7 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) static struct ceph_osd_request *rbd_osd_req_create( struct rbd_device *rbd_dev, bool write_request, + unsigned int num_ops, struct rbd_obj_request *obj_request) { struct ceph_snap_context *snapc = NULL; @@ -1733,10 +1734,13 @@ static struct ceph_osd_request *rbd_osd_req_create( snapc = img_request->snapc; } - /* Allocate and initialize the request, for the single op */ + rbd_assert(num_ops == 1); + + /* Allocate and initialize the request, for the num_ops ops */ osdc = &rbd_dev->rbd_client->client->osdc; - osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC); + osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, + GFP_ATOMIC); if (!osd_req) return NULL; /* ENOMEM */ @@ -2220,8 +2224,8 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, pages += page_count; } - osd_req = rbd_osd_req_create(rbd_dev, write_request, - obj_request); + osd_req = rbd_osd_req_create(rbd_dev, write_request, 1, + obj_request); if (!osd_req) goto out_unwind; obj_request->osd_req = osd_req; @@ -2602,8 +2606,8 @@ static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) rbd_assert(obj_request->img_request); rbd_dev = obj_request->img_request->rbd_dev; - stat_request->osd_req = rbd_osd_req_create(rbd_dev, false, - stat_request); + stat_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1, + stat_request); if (!stat_request->osd_req) goto out; stat_request->callback = rbd_img_obj_exists_callback; @@ -2806,7 +2810,8 @@ static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id) return -ENOMEM; ret = -ENOMEM; - obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); + obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1, + obj_request); if (!obj_request->osd_req) goto out; @@ -2869,7 +2874,8 @@ static int __rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start) if (!obj_request) goto out_cancel; - obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request); + obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1, + obj_request); if (!obj_request->osd_req) goto out_cancel; @@ -2977,7 +2983,8 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, obj_request->pages = pages; obj_request->page_count = page_count; - obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); + obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1, + obj_request); if (!obj_request->osd_req) goto out; @@ -3210,7 +3217,8 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, obj_request->pages = pages; obj_request->page_count = page_count; - obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); + obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1, + obj_request); if (!obj_request->osd_req) goto out; -- cgit v1.2.3 From 0ccd59266973047770d5160318561c9189b79c93 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 25 Feb 2014 16:22:28 +0200 Subject: rbd: prefix rbd writes with CEPH_OSD_OP_SETALLOCHINT osd op In an effort to reduce fragmentation, prefix every rbd write with a CEPH_OSD_OP_SETALLOCHINT osd op with an 
expected_write_size value set to the object size (1 << order). Backwards compatibility is taken care of on the libceph/osd side. "The CEPH_OSD_OP_SETALLOCHINT hint is durable, in that it's enough to do it once. The reason every rbd write is prefixed is that rbd doesn't explicitly create objects and relies on writes creating them implicitly, so there is no place to stick a single hint op into. To get around that we decided to prefix every rbd write with a hint (just like write and setattr ops, hint op will create an object implicitly if it doesn't exist)." Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil Reviewed-by: Alex Elder --- drivers/block/rbd.c | 54 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 15 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 42ecae1436cc..4c95b503b09e 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1662,11 +1662,15 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, */ obj_request->xferred = osd_req->r_reply_op_len[0]; rbd_assert(obj_request->xferred < (u64)UINT_MAX); + opcode = osd_req->r_ops[0].op; switch (opcode) { case CEPH_OSD_OP_READ: rbd_osd_read_callback(obj_request); break; + case CEPH_OSD_OP_SETALLOCHINT: + rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE); + /* fall through */ case CEPH_OSD_OP_WRITE: rbd_osd_write_callback(obj_request); break; @@ -1715,6 +1719,12 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) snapc, CEPH_NOSNAP, &mtime); } +/* + * Create an osd request. A read request has one osd op (read). + * A write request has either one (watch) or two (hint+write) osd ops. + * (All rbd data writes are prefixed with an allocation hint op, but + * technically osd watch is a write request, hence this distinction.) + */ static struct ceph_osd_request *rbd_osd_req_create( struct rbd_device *rbd_dev, bool write_request, @@ -1734,7 +1744,7 @@ static struct ceph_osd_request *rbd_osd_req_create( snapc = img_request->snapc; } - rbd_assert(num_ops == 1); + rbd_assert(num_ops == 1 || (write_request && num_ops == 2)); /* Allocate and initialize the request, for the num_ops ops */ @@ -1760,8 +1770,8 @@ static struct ceph_osd_request *rbd_osd_req_create( /* * Create a copyup osd request based on the information in the - * object request supplied. A copyup request has two osd ops, - * a copyup method call, and a "normal" write request. + * object request supplied. A copyup request has three osd ops, + * a copyup method call, a hint op, and a write op. 
*/ static struct ceph_osd_request * rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) @@ -1777,12 +1787,12 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) rbd_assert(img_request); rbd_assert(img_request_write_test(img_request)); - /* Allocate and initialize the request, for the two ops */ + /* Allocate and initialize the request, for the three ops */ snapc = img_request->snapc; rbd_dev = img_request->rbd_dev; osdc = &rbd_dev->rbd_client->client->osdc; - osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC); + osd_req = ceph_osdc_alloc_request(osdc, snapc, 3, false, GFP_ATOMIC); if (!osd_req) return NULL; /* ENOMEM */ @@ -2182,6 +2192,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, const char *object_name; u64 offset; u64 length; + unsigned int which = 0; object_name = rbd_segment_name(rbd_dev, img_offset); if (!object_name) @@ -2224,20 +2235,28 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, pages += page_count; } - osd_req = rbd_osd_req_create(rbd_dev, write_request, 1, + osd_req = rbd_osd_req_create(rbd_dev, write_request, + (write_request ? 2 : 1), obj_request); if (!osd_req) goto out_unwind; obj_request->osd_req = osd_req; obj_request->callback = rbd_img_obj_callback; - osd_req_op_extent_init(osd_req, 0, opcode, offset, length, - 0, 0); + if (write_request) { + osd_req_op_alloc_hint_init(osd_req, which, + rbd_obj_bytes(&rbd_dev->header), + rbd_obj_bytes(&rbd_dev->header)); + which++; + } + + osd_req_op_extent_init(osd_req, which, opcode, offset, length, + 0, 0); if (type == OBJ_REQUEST_BIO) - osd_req_op_extent_osd_data_bio(osd_req, 0, + osd_req_op_extent_osd_data_bio(osd_req, which, obj_request->bio_list, length); else - osd_req_op_extent_osd_data_pages(osd_req, 0, + osd_req_op_extent_osd_data_pages(osd_req, which, obj_request->pages, length, offset & ~PAGE_MASK, false, false); @@ -2356,7 +2375,7 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) /* * The original osd request is of no use to use any more. - * We need a new one that can hold the two ops in a copyup + * We need a new one that can hold the three ops in a copyup * request. Allocate the new copyup osd request for the * original request, and release the old one. */ @@ -2375,17 +2394,22 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0, false, false); - /* Then the original write request op */ + /* Then the hint op */ + + osd_req_op_alloc_hint_init(osd_req, 1, rbd_obj_bytes(&rbd_dev->header), + rbd_obj_bytes(&rbd_dev->header)); + + /* And the original write request op */ offset = orig_request->offset; length = orig_request->length; - osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE, + osd_req_op_extent_init(osd_req, 2, CEPH_OSD_OP_WRITE, offset, length, 0, 0); if (orig_request->type == OBJ_REQUEST_BIO) - osd_req_op_extent_osd_data_bio(osd_req, 1, + osd_req_op_extent_osd_data_bio(osd_req, 2, orig_request->bio_list, length); else - osd_req_op_extent_osd_data_pages(osd_req, 1, + osd_req_op_extent_osd_data_pages(osd_req, 2, orig_request->pages, length, offset & ~PAGE_MASK, false, false); -- cgit v1.2.3 From be2d1d56c82d8cf20e6c77515eb499f8e86eb5be Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:00 -0700 Subject: zram: drop `init_done' struct zram member Introduce init_done() helper function which allows us to drop `init_done' struct zram member. 
init_done() uses the fact that ->init_done == 1 equals to ->meta != NULL. Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Acked-by: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 21 +++++++++++---------- drivers/block/zram/zram_drv.h | 1 - 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 51c557cfd92b..b3e6f072c19b 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -42,6 +42,11 @@ static struct zram *zram_devices; /* Module params (documentation at end) */ static unsigned int num_devices = 1; +static inline int init_done(struct zram *zram) +{ + return zram->meta != NULL; +} + static inline struct zram *dev_to_zram(struct device *dev) { return (struct zram *)dev_to_disk(dev)->private_data; @@ -60,7 +65,7 @@ static ssize_t initstate_show(struct device *dev, { struct zram *zram = dev_to_zram(dev); - return sprintf(buf, "%u\n", zram->init_done); + return sprintf(buf, "%u\n", init_done(zram)); } static ssize_t num_reads_show(struct device *dev, @@ -133,7 +138,7 @@ static ssize_t mem_used_total_show(struct device *dev, struct zram_meta *meta = zram->meta; down_read(&zram->init_lock); - if (zram->init_done) + if (init_done(zram)) val = zs_get_total_size_bytes(meta->mem_pool); up_read(&zram->init_lock); @@ -546,14 +551,12 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) struct zram_meta *meta; down_write(&zram->init_lock); - if (!zram->init_done) { + if (!init_done(zram)) { up_write(&zram->init_lock); return; } meta = zram->meta; - zram->init_done = 0; - /* Free all pages that are still in this zram device */ for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) { unsigned long handle = meta->table[index].handle; @@ -594,8 +597,6 @@ static void zram_init_device(struct zram *zram, struct zram_meta *meta) queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue); zram->meta = meta; - zram->init_done = 1; - pr_debug("Initialization done!\n"); } @@ -615,7 +616,7 @@ static ssize_t disksize_store(struct device *dev, if (!meta) return -ENOMEM; down_write(&zram->init_lock); - if (zram->init_done) { + if (init_done(zram)) { up_write(&zram->init_lock); zram_meta_free(meta); pr_info("Cannot change disksize for initialized device\n"); @@ -736,7 +737,7 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio) struct zram *zram = queue->queuedata; down_read(&zram->init_lock); - if (unlikely(!zram->init_done)) + if (unlikely(!init_done(zram))) goto error; if (!valid_io_request(zram, bio)) { @@ -859,7 +860,7 @@ static int create_device(struct zram *zram, int device_id) goto out_free_disk; } - zram->init_done = 0; + zram->meta = NULL; return 0; out_free_disk: diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index ad8aa35bae00..e81e9cdf4147 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -95,7 +95,6 @@ struct zram { struct zram_meta *meta; struct request_queue *queue; struct gendisk *disk; - int init_done; /* Prevent concurrent execution of device init, reset and R/W request */ struct rw_semaphore init_lock; /* -- cgit v1.2.3 From be257c61306750d11c20d2ac567bf63304c696a3 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:01 -0700 Subject: zram: do not pass rw argument to __zram_make_request() Do not pass rw argument down the __zram_make_request() 
-> zram_bvec_rw() chain, decode it in zram_bvec_rw() instead. Besides, this is the place where we distinguish READ and WRITE bio data directions, so account zram RW stats here, instead of __zram_make_request(). This also allows to account a real number of zram READ/WRITE operations, not just requests (single RW request may cause a number of zram RW ops with separate locking, compression/decompression, etc). Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Acked-by: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index b3e6f072c19b..3f0d6de36f74 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -533,14 +533,18 @@ out: } static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, - int offset, struct bio *bio, int rw) + int offset, struct bio *bio) { int ret; + int rw = bio_data_dir(bio); - if (rw == READ) + if (rw == READ) { + atomic64_inc(&zram->stats.num_reads); ret = zram_bvec_read(zram, bvec, index, offset, bio); - else + } else { + atomic64_inc(&zram->stats.num_writes); ret = zram_bvec_write(zram, bvec, index, offset); + } return ret; } @@ -672,22 +676,13 @@ out: return ret; } -static void __zram_make_request(struct zram *zram, struct bio *bio, int rw) +static void __zram_make_request(struct zram *zram, struct bio *bio) { int offset; u32 index; struct bio_vec bvec; struct bvec_iter iter; - switch (rw) { - case READ: - atomic64_inc(&zram->stats.num_reads); - break; - case WRITE: - atomic64_inc(&zram->stats.num_writes); - break; - } - index = bio->bi_iter.bi_sector >> SECTORS_PER_PAGE_SHIFT; offset = (bio->bi_iter.bi_sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT; @@ -706,16 +701,15 @@ static void __zram_make_request(struct zram *zram, struct bio *bio, int rw) bv.bv_len = max_transfer_size; bv.bv_offset = bvec.bv_offset; - if (zram_bvec_rw(zram, &bv, index, offset, bio, rw) < 0) + if (zram_bvec_rw(zram, &bv, index, offset, bio) < 0) goto out; bv.bv_len = bvec.bv_len - max_transfer_size; bv.bv_offset += max_transfer_size; - if (zram_bvec_rw(zram, &bv, index+1, 0, bio, rw) < 0) + if (zram_bvec_rw(zram, &bv, index + 1, 0, bio) < 0) goto out; } else - if (zram_bvec_rw(zram, &bvec, index, offset, bio, rw) - < 0) + if (zram_bvec_rw(zram, &bvec, index, offset, bio) < 0) goto out; update_position(&index, &offset, &bvec); @@ -745,7 +739,7 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio) goto error; } - __zram_make_request(zram, bio, bio_data_dir(bio)); + __zram_make_request(zram, bio); up_read(&zram->init_lock); return; -- cgit v1.2.3 From b7cccf8b4009bf74df61f3c9d86b95fabd807c11 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:02 -0700 Subject: zram: remove good and bad compress stats Remove `good' and `bad' compressed sub-requests stats. RW request may cause a number of RW sub-requests. zram used to account `good' compressed sub-queries (with compressed size less than 50% of original size), `bad' compressed sub-queries (with compressed size greater that 75% of original size), leaving sub-requests with compression size between 50% and 75% of original size not accounted and not reported. zram already accounts each sub-request's compression size so we can calculate real device compression ratio. 
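With the per-request good/bad buckets gone, the overall ratio can still be derived in userspace from counters zram already exports. A small illustrative reader follows; it assumes a device named zram0, and uses the orig_data_size and compr_data_size sysfs attributes that appear elsewhere in this series.

#include <stdio.h>

/* Read one unsigned counter from a zram sysfs attribute. */
static unsigned long long read_stat(const char *path)
{
	unsigned long long v = 0;
	FILE *f = fopen(path, "r");

	if (f) {
		if (fscanf(f, "%llu", &v) != 1)
			v = 0;
		fclose(f);
	}
	return v;
}

int main(void)
{
	unsigned long long orig = read_stat("/sys/block/zram0/orig_data_size");
	unsigned long long compr = read_stat("/sys/block/zram0/compr_data_size");

	if (compr)
		printf("compression ratio: %.2f\n", (double)orig / (double)compr);
	return 0;
}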
Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Acked-by: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 11 ----------- drivers/block/zram/zram_drv.h | 2 -- 2 files changed, 13 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 3f0d6de36f74..995304ef40bb 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -293,7 +293,6 @@ static void zram_free_page(struct zram *zram, size_t index) { struct zram_meta *meta = zram->meta; unsigned long handle = meta->table[index].handle; - u16 size = meta->table[index].size; if (unlikely(!handle)) { /* @@ -307,14 +306,8 @@ static void zram_free_page(struct zram *zram, size_t index) return; } - if (unlikely(size > max_zpage_size)) - atomic_dec(&zram->stats.bad_compress); - zs_free(meta->mem_pool, handle); - if (size <= PAGE_SIZE / 2) - atomic_dec(&zram->stats.good_compress); - atomic64_sub(meta->table[index].size, &zram->stats.compr_size); atomic_dec(&zram->stats.pages_stored); @@ -478,7 +471,6 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, } if (unlikely(clen > max_zpage_size)) { - atomic_inc(&zram->stats.bad_compress); clen = PAGE_SIZE; src = NULL; if (is_partial_io(bvec)) @@ -518,9 +510,6 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, /* Update stats */ atomic64_add(clen, &zram->stats.compr_size); atomic_inc(&zram->stats.pages_stored); - if (clen <= PAGE_SIZE / 2) - atomic_inc(&zram->stats.good_compress); - out: if (locked) mutex_unlock(&meta->buffer_lock); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index e81e9cdf4147..2f173cb1fd0a 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -78,8 +78,6 @@ struct zram_stats { atomic64_t notify_free; /* no. of swap slot free notifications */ atomic_t pages_zero; /* no. of zero filled pages */ atomic_t pages_stored; /* no. of pages currently stored */ - atomic_t good_compress; /* % of pages with compression ratio<=50% */ - atomic_t bad_compress; /* % of pages with compression ratio>=75% */ }; struct zram_meta { -- cgit v1.2.3 From 90a7806ea9b9f7cb4751859cc2506e2d80e36ef1 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:03 -0700 Subject: zram: use atomic64_t for all zram stats This is a preparation patch for stats code duplication removal. 1) use atomic64_t for `pages_zero' and `pages_stored' zram stats. 2) `compr_size' and `pages_zero' struct zram_stats members did not follow the existing device attr naming scheme: zram_stats.ATTR has ATTR_show() function. rename them: -- compr_size -> compr_data_size -- pages_zero -> zero_pages Minchan Kim's note: If we really have trouble with atomic stat operation, we could change it with percpu_counter so that it could solve atomic overhead and unnecessary memory space by introducing unsigned long instead of 64bit atomic_t. 
Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Acked-by: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 18 +++++++++--------- drivers/block/zram/zram_drv.h | 6 +++--- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 995304ef40bb..1bf97b2f8f3f 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -109,7 +109,7 @@ static ssize_t zero_pages_show(struct device *dev, { struct zram *zram = dev_to_zram(dev); - return sprintf(buf, "%u\n", atomic_read(&zram->stats.pages_zero)); + return sprintf(buf, "%llu\n", (u64)atomic64_read(&zram->stats.zero_pages)); } static ssize_t orig_data_size_show(struct device *dev, @@ -118,7 +118,7 @@ static ssize_t orig_data_size_show(struct device *dev, struct zram *zram = dev_to_zram(dev); return sprintf(buf, "%llu\n", - (u64)(atomic_read(&zram->stats.pages_stored)) << PAGE_SHIFT); + (u64)(atomic64_read(&zram->stats.pages_stored)) << PAGE_SHIFT); } static ssize_t compr_data_size_show(struct device *dev, @@ -127,7 +127,7 @@ static ssize_t compr_data_size_show(struct device *dev, struct zram *zram = dev_to_zram(dev); return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.compr_size)); + (u64)atomic64_read(&zram->stats.compr_data_size)); } static ssize_t mem_used_total_show(struct device *dev, @@ -301,15 +301,15 @@ static void zram_free_page(struct zram *zram, size_t index) */ if (zram_test_flag(meta, index, ZRAM_ZERO)) { zram_clear_flag(meta, index, ZRAM_ZERO); - atomic_dec(&zram->stats.pages_zero); + atomic64_dec(&zram->stats.zero_pages); } return; } zs_free(meta->mem_pool, handle); - atomic64_sub(meta->table[index].size, &zram->stats.compr_size); - atomic_dec(&zram->stats.pages_stored); + atomic64_sub(meta->table[index].size, &zram->stats.compr_data_size); + atomic64_dec(&zram->stats.pages_stored); meta->table[index].handle = 0; meta->table[index].size = 0; @@ -452,7 +452,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, zram_set_flag(meta, index, ZRAM_ZERO); write_unlock(&zram->meta->tb_lock); - atomic_inc(&zram->stats.pages_zero); + atomic64_inc(&zram->stats.zero_pages); ret = 0; goto out; } @@ -508,8 +508,8 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, write_unlock(&zram->meta->tb_lock); /* Update stats */ - atomic64_add(clen, &zram->stats.compr_size); - atomic_inc(&zram->stats.pages_stored); + atomic64_add(clen, &zram->stats.compr_data_size); + atomic64_inc(&zram->stats.pages_stored); out: if (locked) mutex_unlock(&meta->buffer_lock); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 2f173cb1fd0a..58d4ac537f65 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -69,15 +69,15 @@ struct table { } __aligned(4); struct zram_stats { - atomic64_t compr_size; /* compressed size of pages stored */ + atomic64_t compr_data_size; /* compressed size of pages stored */ atomic64_t num_reads; /* failed + successful */ atomic64_t num_writes; /* --do-- */ atomic64_t failed_reads; /* should NEVER! happen */ atomic64_t failed_writes; /* can happen when memory is too low */ atomic64_t invalid_io; /* non-page-aligned I/O requests */ atomic64_t notify_free; /* no. of swap slot free notifications */ - atomic_t pages_zero; /* no. of zero filled pages */ - atomic_t pages_stored; /* no. 
of pages currently stored */ + atomic64_t zero_pages; /* no. of zero filled pages */ + atomic64_t pages_stored; /* no. of pages currently stored */ }; struct zram_meta { -- cgit v1.2.3 From a68eb3b65e658406d386bebef02277f4007b2f45 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:04 -0700 Subject: zram: remove zram stats code duplication Introduce ZRAM_ATTR_RO macro that generates device_attribute and default ATTR show() function for existing atomic64_t zram stats. Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 82 ++++++++++++------------------------------- 1 file changed, 23 insertions(+), 59 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 1bf97b2f8f3f..29c35119e82a 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -42,6 +42,17 @@ static struct zram *zram_devices; /* Module params (documentation at end) */ static unsigned int num_devices = 1; +#define ZRAM_ATTR_RO(name) \ +static ssize_t zram_attr_##name##_show(struct device *d, \ + struct device_attribute *attr, char *b) \ +{ \ + struct zram *zram = dev_to_zram(d); \ + return sprintf(b, "%llu\n", \ + (u64)atomic64_read(&zram->stats.name)); \ +} \ +static struct device_attribute dev_attr_##name = \ + __ATTR(name, S_IRUGO, zram_attr_##name##_show, NULL); + static inline int init_done(struct zram *zram) { return zram->meta != NULL; @@ -63,53 +74,14 @@ static ssize_t disksize_show(struct device *dev, static ssize_t initstate_show(struct device *dev, struct device_attribute *attr, char *buf) { + u32 val; struct zram *zram = dev_to_zram(dev); - return sprintf(buf, "%u\n", init_done(zram)); -} - -static ssize_t num_reads_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.num_reads)); -} - -static ssize_t num_writes_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.num_writes)); -} - -static ssize_t invalid_io_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.invalid_io)); -} - -static ssize_t notify_free_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.notify_free)); -} - -static ssize_t zero_pages_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); + down_read(&zram->init_lock); + val = init_done(zram); + up_read(&zram->init_lock); - return sprintf(buf, "%llu\n", (u64)atomic64_read(&zram->stats.zero_pages)); + return sprintf(buf, "%u\n", val); } static ssize_t orig_data_size_show(struct device *dev, @@ -121,15 +93,6 @@ static ssize_t orig_data_size_show(struct device *dev, (u64)(atomic64_read(&zram->stats.pages_stored)) << PAGE_SHIFT); } -static ssize_t compr_data_size_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.compr_data_size)); -} - static ssize_t 
mem_used_total_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -762,15 +725,16 @@ static DEVICE_ATTR(disksize, S_IRUGO | S_IWUSR, disksize_show, disksize_store); static DEVICE_ATTR(initstate, S_IRUGO, initstate_show, NULL); static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store); -static DEVICE_ATTR(num_reads, S_IRUGO, num_reads_show, NULL); -static DEVICE_ATTR(num_writes, S_IRUGO, num_writes_show, NULL); -static DEVICE_ATTR(invalid_io, S_IRUGO, invalid_io_show, NULL); -static DEVICE_ATTR(notify_free, S_IRUGO, notify_free_show, NULL); -static DEVICE_ATTR(zero_pages, S_IRUGO, zero_pages_show, NULL); static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL); -static DEVICE_ATTR(compr_data_size, S_IRUGO, compr_data_size_show, NULL); static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); +ZRAM_ATTR_RO(num_reads); +ZRAM_ATTR_RO(num_writes); +ZRAM_ATTR_RO(invalid_io); +ZRAM_ATTR_RO(notify_free); +ZRAM_ATTR_RO(zero_pages); +ZRAM_ATTR_RO(compr_data_size); + static struct attribute *zram_disk_attrs[] = { &dev_attr_disksize.attr, &dev_attr_initstate.attr, -- cgit v1.2.3 From 6444724939db5de7390c90f7b4a657159b3b4465 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:05 -0700 Subject: zram: report failed read and write stats zram accounted but did not report numbers of failed read and write queries. make these stats available as failed_reads and failed_writes attrs. Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Acked-by: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 29c35119e82a..c06d7fba4237 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -730,6 +730,8 @@ static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); ZRAM_ATTR_RO(num_reads); ZRAM_ATTR_RO(num_writes); +ZRAM_ATTR_RO(failed_reads); +ZRAM_ATTR_RO(failed_writes); ZRAM_ATTR_RO(invalid_io); ZRAM_ATTR_RO(notify_free); ZRAM_ATTR_RO(zero_pages); @@ -741,6 +743,8 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_reset.attr, &dev_attr_num_reads.attr, &dev_attr_num_writes.attr, + &dev_attr_failed_reads.attr, + &dev_attr_failed_writes.attr, &dev_attr_invalid_io.attr, &dev_attr_notify_free.attr, &dev_attr_zero_pages.attr, -- cgit v1.2.3 From 59fc86a4922f1a1c0f69eac758a7e2b2b138aab4 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:06 -0700 Subject: zram: drop not used table `count' member struct table `count' member is not used. 
Signed-off-by: Sergey Senozhatsky Cc: Minchan Kim Acked-by: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.h | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 58d4ac537f65..1d5b1f5786a8 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -64,7 +64,6 @@ enum zram_pageflags { struct table { unsigned long handle; u16 size; /* object size (excluding header) */ - u8 count; /* object ref count (not yet used) */ u8 flags; } __aligned(4); -- cgit v1.2.3 From e64cd51d2fa87733176246101df871a8ac5c7c20 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:07 -0700 Subject: zram: move zram size warning to documentation Move zram warning about disksize and size of memory correlation to zram documentation. Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index c06d7fba4237..21aee3edcb25 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -535,23 +535,8 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) static void zram_init_device(struct zram *zram, struct zram_meta *meta) { - if (zram->disksize > 2 * (totalram_pages << PAGE_SHIFT)) { - pr_info( - "There is little point creating a zram of greater than " - "twice the size of memory since we expect a 2:1 compression " - "ratio. Note that zram uses about 0.1%% of the size of " - "the disk when not in use so a huge zram is " - "wasteful.\n" - "\tMemory Size: %lu kB\n" - "\tSize you selected: %llu kB\n" - "Continuing anyway ...\n", - (totalram_pages << PAGE_SHIFT) >> 10, zram->disksize >> 10 - ); - } - /* zram devices sort of resembles non-rotational disks */ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue); - zram->meta = meta; pr_debug("Initialization done!\n"); } -- cgit v1.2.3 From b67d1ec189ffb92cdad9b2bd29475fb1e0166983 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:09 -0700 Subject: zram: delete zram_init_device() allocate new `zram_meta' in disksize_store() only for uninitialised zram device, saving a number of allocations and deallocations in case if disksize_store() was called on currently used device. at the same time zram_meta stack variable is not necessary, because we can set ->meta directly. there is also no need in setting QUEUE_FLAG_NONROT queue on every disksize_store(), set it once during device creation. 
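Condensed from the patch below, the resulting disksize_store() flow is: allocate with GFP_KERNEL while no lock is held, then take init_lock only to publish the result (a simplified sketch, not a verbatim quote):

	meta = zram_meta_alloc(disksize);	/* may sleep, no lock held */
	if (!meta)
		return -ENOMEM;

	down_write(&zram->init_lock);
	if (init_done(zram)) {			/* device already initialised */
		zram_meta_free(meta);
		up_write(&zram->init_lock);
		return -EBUSY;
	}
	zram->meta = meta;			/* set ->meta directly */
	zram->disksize = disksize;
	set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
	up_write(&zram->init_lock);
	return len;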
[minchan@kernel.org: handle zram->meta alloc fail case] [minchan@kernel.org: prevent lockdep spew of init_lock] Signed-off-by: Sergey Senozhatsky Signed-off-by: Minchan Kim Acked-by: Jerome Marchand Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 21aee3edcb25..76ba67673a90 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -533,14 +533,6 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) up_write(&zram->init_lock); } -static void zram_init_device(struct zram *zram, struct zram_meta *meta) -{ - /* zram devices sort of resembles non-rotational disks */ - queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue); - zram->meta = meta; - pr_debug("Initialization done!\n"); -} - static ssize_t disksize_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -556,17 +548,18 @@ static ssize_t disksize_store(struct device *dev, meta = zram_meta_alloc(disksize); if (!meta) return -ENOMEM; + down_write(&zram->init_lock); if (init_done(zram)) { - up_write(&zram->init_lock); zram_meta_free(meta); + up_write(&zram->init_lock); pr_info("Cannot change disksize for initialized device\n"); return -EBUSY; } + zram->meta = meta; zram->disksize = disksize; set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); - zram_init_device(zram, meta); up_write(&zram->init_lock); return len; @@ -776,7 +769,8 @@ static int create_device(struct zram *zram, int device_id) /* Actual capacity set using syfs (/sys/block/zram/disksize */ set_capacity(zram->disk, 0); - + /* zram devices sort of resembles non-rotational disks */ + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue); /* * To ensure that we always get PAGE_SIZE aligned * and n*PAGE_SIZED sized I/O requests. -- cgit v1.2.3 From e7e1ef439d18f9a21521116ea9f2b976d7230e54 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:11 -0700 Subject: zram: introduce compressing backend abstraction ZRAM performs direct LZO compression algorithm calls, making it the one and only option. While LZO generally performs well, the LZ4 algorithm tends to have faster decompression (see http://code.google.com/p/lz4/ for full report) Name Ratio C.speed D.speed MB/s MB/s LZ4 (r101) 2.084 422 1820 LZO 2.06 2.106 414 600 Thus, users who have mostly read (decompress) usage scenarios or a mixed workload (writes with a relatively high number of read ops) will benefit from using the LZ4 compression backend. Introduce compressing backend abstraction zcomp in order to support multiple compression algorithms with the following set of operations: .create .destroy .compress .decompress Schematically, zram write() usually contains the following steps: 0) preparation (decompression of partial IO, etc.) 1) lock buffer_lock mutex (protects meta compress buffers) 2) compress (using meta compress buffers) 3) alloc and map zs_pool object 4) copy compressed data (from meta compress buffers) to object allocated by 3) 5) free previous pool page, assign a new one 6) unlock buffer_lock mutex As we can see, compressing buffers must remain untouched from 1) to 4), because, otherwise, concurrent write() can overwrite data.
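In the current driver these steps correspond, roughly, to the following fragment (an illustrative condensation of zram_bvec_write(), not a verbatim quote):

	mutex_lock(&meta->buffer_lock);				/* step 1 */
	ret = lzo1x_1_compress(uncmem, PAGE_SIZE,
			meta->compress_buffer, &clen,
			meta->compress_workmem);		/* step 2 */
	handle = zs_malloc(meta->mem_pool, clen);		/* step 3 */
	cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO);
	memcpy(cmem, meta->compress_buffer, clen);		/* step 4 */
	zs_unmap_object(meta->mem_pool, handle);
	zram_free_page(zram, index);				/* step 5 */
	meta->table[index].handle = handle;
	mutex_unlock(&meta->buffer_lock);			/* step 6 */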
At the same time, zram_meta must be aware of a) specific compression algorithm memory requirements and b) necessary locking to protect compression buffers. To remove requirement a) new struct zcomp_strm introduced, which contains a compress/decompress `buffer' and compression algorithm `private' part. While struct zcomp implements zcomp_strm stream handling and locking and removes requirement b) from zram meta. zcomp ->create() and ->destroy(), respectively, allocate and deallocate algorithm specific zcomp_strm `private' part. Every zcomp has zcomp stream and mutex to protect its compression stream. Stream usage semantics remains the same -- only one write can hold stream lock and use its buffers. zcomp_strm_find() turns caller into exclusive user of a stream (holding stream mutex until zram release stream), and zcomp_strm_release() makes zcomp stream available (unlock the stream mutex). Hence no concurrent write (compression) operations possible at the moment. iozone -t 3 -R -r 16K -s 60M -I +Z test base patched -------------------------------------------------- Initial write 597992.91 591660.58 Rewrite 609674.34 616054.97 Read 2404771.75 2452909.12 Re-read 2459216.81 2470074.44 Reverse Read 1652769.66 1589128.66 Stride read 2202441.81 2202173.31 Random read 2236311.47 2276565.31 Mixed workload 1423760.41 1709760.06 Random write 579584.08 615933.86 Pwrite 597550.02 594933.70 Pread 1703672.53 1718126.72 Fwrite 1330497.06 1461054.00 Fread 3922851.00 3957242.62 Usage examples: comp = zcomp_create(NAME) /* NAME e.g. "lzo" */ which initialises compressing backend if requested algorithm is supported. Compress: zstrm = zcomp_strm_find(comp) zcomp_compress(comp, zstrm, src, &dst_len) [..] /* copy compressed data */ zcomp_strm_release(comp, zstrm) Decompress: zcomp_decompress(comp, src, src_len, dst); Free compessing backend and its zcomp stream: zcomp_destroy(comp) Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zcomp.c | 115 +++++++++++++++++++++++++++++++++++++++++ drivers/block/zram/zcomp.h | 58 +++++++++++++++++++++ drivers/block/zram/zcomp_lzo.c | 47 +++++++++++++++++ drivers/block/zram/zcomp_lzo.h | 17 ++++++ 4 files changed, 237 insertions(+) create mode 100644 drivers/block/zram/zcomp.c create mode 100644 drivers/block/zram/zcomp.h create mode 100644 drivers/block/zram/zcomp_lzo.c create mode 100644 drivers/block/zram/zcomp_lzo.h (limited to 'drivers/block') diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c new file mode 100644 index 000000000000..22f4ae235660 --- /dev/null +++ b/drivers/block/zram/zcomp.c @@ -0,0 +1,115 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include + +#include "zcomp.h" +#include "zcomp_lzo.h" + +static struct zcomp_backend *find_backend(const char *compress) +{ + if (strncmp(compress, "lzo", 3) == 0) + return &zcomp_lzo; + return NULL; +} + +static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *zstrm) +{ + if (zstrm->private) + comp->backend->destroy(zstrm->private); + free_pages((unsigned long)zstrm->buffer, 1); + kfree(zstrm); +} + +/* + * allocate new zcomp_strm structure with ->private initialized by + * backend, return NULL on error + */ +static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp) +{ + struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), GFP_KERNEL); + if (!zstrm) + return NULL; + + zstrm->private = comp->backend->create(); + /* + * allocate 2 pages. 1 for compressed data, plus 1 extra for the + * case when compressed size is larger than the original one + */ + zstrm->buffer = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1); + if (!zstrm->private || !zstrm->buffer) { + zcomp_strm_free(comp, zstrm); + zstrm = NULL; + } + return zstrm; +} + +struct zcomp_strm *zcomp_strm_find(struct zcomp *comp) +{ + mutex_lock(&comp->strm_lock); + return comp->zstrm; +} + +void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm) +{ + mutex_unlock(&comp->strm_lock); +} + +int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, + const unsigned char *src, size_t *dst_len) +{ + return comp->backend->compress(src, zstrm->buffer, dst_len, + zstrm->private); +} + +int zcomp_decompress(struct zcomp *comp, const unsigned char *src, + size_t src_len, unsigned char *dst) +{ + return comp->backend->decompress(src, src_len, dst); +} + +void zcomp_destroy(struct zcomp *comp) +{ + zcomp_strm_free(comp, comp->zstrm); + kfree(comp); +} + +/* + * search available compressors for requested algorithm. + * allocate new zcomp and initialize it. return NULL + * if requested algorithm is not supported or in case + * of init error + */ +struct zcomp *zcomp_create(const char *compress) +{ + struct zcomp *comp; + struct zcomp_backend *backend; + + backend = find_backend(compress); + if (!backend) + return NULL; + + comp = kzalloc(sizeof(struct zcomp), GFP_KERNEL); + if (!comp) + return NULL; + + comp->backend = backend; + mutex_init(&comp->strm_lock); + + comp->zstrm = zcomp_strm_alloc(comp); + if (!comp->zstrm) { + kfree(comp); + return NULL; + } + return comp; +} diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h new file mode 100644 index 000000000000..c9a98e1317fe --- /dev/null +++ b/drivers/block/zram/zcomp.h @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _ZCOMP_H_ +#define _ZCOMP_H_ + +#include + +struct zcomp_strm { + /* compression/decompression buffer */ + void *buffer; + /* + * The private data of the compression stream, only compression + * stream backend can touch this (e.g. 
compression algorithm + * working memory) + */ + void *private; +}; + +/* static compression backend */ +struct zcomp_backend { + int (*compress)(const unsigned char *src, unsigned char *dst, + size_t *dst_len, void *private); + + int (*decompress)(const unsigned char *src, size_t src_len, + unsigned char *dst); + + void *(*create)(void); + void (*destroy)(void *private); + + const char *name; +}; + +/* dynamic per-device compression frontend */ +struct zcomp { + struct mutex strm_lock; + struct zcomp_strm *zstrm; + struct zcomp_backend *backend; +}; + +struct zcomp *zcomp_create(const char *comp); +void zcomp_destroy(struct zcomp *comp); + +struct zcomp_strm *zcomp_strm_find(struct zcomp *comp); +void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm); + +int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, + const unsigned char *src, size_t *dst_len); + +int zcomp_decompress(struct zcomp *comp, const unsigned char *src, + size_t src_len, unsigned char *dst); +#endif /* _ZCOMP_H_ */ diff --git a/drivers/block/zram/zcomp_lzo.c b/drivers/block/zram/zcomp_lzo.c new file mode 100644 index 000000000000..da1bc47d588e --- /dev/null +++ b/drivers/block/zram/zcomp_lzo.c @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include + +#include "zcomp_lzo.h" + +static void *lzo_create(void) +{ + return kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); +} + +static void lzo_destroy(void *private) +{ + kfree(private); +} + +static int lzo_compress(const unsigned char *src, unsigned char *dst, + size_t *dst_len, void *private) +{ + int ret = lzo1x_1_compress(src, PAGE_SIZE, dst, dst_len, private); + return ret == LZO_E_OK ? 0 : ret; +} + +static int lzo_decompress(const unsigned char *src, size_t src_len, + unsigned char *dst) +{ + size_t dst_len = PAGE_SIZE; + int ret = lzo1x_decompress_safe(src, src_len, dst, &dst_len); + return ret == LZO_E_OK ? 0 : ret; +} + +struct zcomp_backend zcomp_lzo = { + .compress = lzo_compress, + .decompress = lzo_decompress, + .create = lzo_create, + .destroy = lzo_destroy, + .name = "lzo", +}; diff --git a/drivers/block/zram/zcomp_lzo.h b/drivers/block/zram/zcomp_lzo.h new file mode 100644 index 000000000000..128c5807fa14 --- /dev/null +++ b/drivers/block/zram/zcomp_lzo.h @@ -0,0 +1,17 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _ZCOMP_LZO_H_ +#define _ZCOMP_LZO_H_ + +#include "zcomp.h" + +extern struct zcomp_backend zcomp_lzo; + +#endif /* _ZCOMP_LZO_H_ */ -- cgit v1.2.3 From b7ca232ee7e85ed3b18e39eb20a7f458ee1d6047 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:12 -0700 Subject: zram: use zcomp compressing backends Do not perform direct LZO compress/decompress calls, initialise and use zcomp LZO backend (single compression stream) instead. 
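After the conversion, the hot part of zram_bvec_write() reduces to roughly the following (illustrative fragment, not the complete function):

	zstrm = zcomp_strm_find(zram->comp);	/* take the per-device stream */
	ret = zcomp_compress(zram->comp, zstrm, uncmem, &clen);
	if (unlikely(ret)) {
		pr_err("Compression failed! err=%d\n", ret);
		goto out;
	}
	src = zstrm->buffer;			/* compressed data is in the stream buffer */
	/* zs_malloc()/zs_map_object()/memcpy() as before */
	zcomp_strm_release(zram->comp, zstrm);	/* hand the stream back */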
[akpm@linux-foundation.org: resolve conflicts with zram-delete-zram_init_device-fix.patch] Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/Makefile | 2 +- drivers/block/zram/zram_drv.c | 69 +++++++++++++++++++------------------------ drivers/block/zram/zram_drv.h | 8 ++--- 3 files changed, 36 insertions(+), 43 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile index cb0f9ced6a93..757c6a5cadff 100644 --- a/drivers/block/zram/Makefile +++ b/drivers/block/zram/Makefile @@ -1,3 +1,3 @@ -zram-y := zram_drv.o +zram-y := zcomp_lzo.o zcomp.o zram_drv.o obj-$(CONFIG_ZRAM) += zram.o diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 76ba67673a90..98823f9ca8b1 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include @@ -38,6 +37,7 @@ /* Globals */ static int zram_major; static struct zram *zram_devices; +static const char *default_compressor = "lzo"; /* Module params (documentation at end) */ static unsigned int num_devices = 1; @@ -160,8 +160,6 @@ static inline int valid_io_request(struct zram *zram, struct bio *bio) static void zram_meta_free(struct zram_meta *meta) { zs_destroy_pool(meta->mem_pool); - kfree(meta->compress_workmem); - free_pages((unsigned long)meta->compress_buffer, 1); vfree(meta->table); kfree(meta); } @@ -173,22 +171,11 @@ static struct zram_meta *zram_meta_alloc(u64 disksize) if (!meta) goto out; - meta->compress_workmem = kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); - if (!meta->compress_workmem) - goto free_meta; - - meta->compress_buffer = - (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1); - if (!meta->compress_buffer) { - pr_err("Error allocating compressor buffer space\n"); - goto free_workmem; - } - num_pages = disksize >> PAGE_SHIFT; meta->table = vzalloc(num_pages * sizeof(*meta->table)); if (!meta->table) { pr_err("Error allocating zram address table\n"); - goto free_buffer; + goto free_meta; } meta->mem_pool = zs_create_pool(GFP_NOIO | __GFP_HIGHMEM); @@ -198,15 +185,10 @@ static struct zram_meta *zram_meta_alloc(u64 disksize) } rwlock_init(&meta->tb_lock); - mutex_init(&meta->buffer_lock); return meta; free_table: vfree(meta->table); -free_buffer: - free_pages((unsigned long)meta->compress_buffer, 1); -free_workmem: - kfree(meta->compress_workmem); free_meta: kfree(meta); meta = NULL; @@ -280,8 +262,7 @@ static void zram_free_page(struct zram *zram, size_t index) static int zram_decompress_page(struct zram *zram, char *mem, u32 index) { - int ret = LZO_E_OK; - size_t clen = PAGE_SIZE; + int ret = 0; unsigned char *cmem; struct zram_meta *meta = zram->meta; unsigned long handle; @@ -301,12 +282,12 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) if (size == PAGE_SIZE) copy_page(mem, cmem); else - ret = lzo1x_decompress_safe(cmem, size, mem, &clen); + ret = zcomp_decompress(zram->comp, cmem, size, mem); zs_unmap_object(meta->mem_pool, handle); read_unlock(&meta->tb_lock); /* Should NEVER happen. Return bio error if it does. */ - if (unlikely(ret != LZO_E_OK)) { + if (unlikely(ret)) { pr_err("Decompression failed! 
err=%d, page=%u\n", ret, index); atomic64_inc(&zram->stats.failed_reads); return ret; @@ -349,7 +330,7 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, ret = zram_decompress_page(zram, uncmem, index); /* Should NEVER happen. Return bio error if it does. */ - if (unlikely(ret != LZO_E_OK)) + if (unlikely(ret)) goto out_cleanup; if (is_partial_io(bvec)) @@ -374,11 +355,10 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, struct page *page; unsigned char *user_mem, *cmem, *src, *uncmem = NULL; struct zram_meta *meta = zram->meta; + struct zcomp_strm *zstrm; bool locked = false; page = bvec->bv_page; - src = meta->compress_buffer; - if (is_partial_io(bvec)) { /* * This is a partial IO. We need to read the full page @@ -394,7 +374,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, goto out; } - mutex_lock(&meta->buffer_lock); + zstrm = zcomp_strm_find(zram->comp); locked = true; user_mem = kmap_atomic(page); @@ -420,22 +400,20 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, goto out; } - ret = lzo1x_1_compress(uncmem, PAGE_SIZE, src, &clen, - meta->compress_workmem); + ret = zcomp_compress(zram->comp, zstrm, uncmem, &clen); if (!is_partial_io(bvec)) { kunmap_atomic(user_mem); user_mem = NULL; uncmem = NULL; } - if (unlikely(ret != LZO_E_OK)) { + if (unlikely(ret)) { pr_err("Compression failed! err=%d\n", ret); goto out; } - + src = zstrm->buffer; if (unlikely(clen > max_zpage_size)) { clen = PAGE_SIZE; - src = NULL; if (is_partial_io(bvec)) src = uncmem; } @@ -457,6 +435,8 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, memcpy(cmem, src, clen); } + zcomp_strm_release(zram->comp, zstrm); + locked = false; zs_unmap_object(meta->mem_pool, handle); /* @@ -475,10 +455,9 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, atomic64_inc(&zram->stats.pages_stored); out: if (locked) - mutex_unlock(&meta->buffer_lock); + zcomp_strm_release(zram->comp, zstrm); if (is_partial_io(bvec)) kfree(uncmem); - if (ret) atomic64_inc(&zram->stats.failed_writes); return ret; @@ -522,6 +501,7 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) zs_free(meta->mem_pool, handle); } + zcomp_destroy(zram->comp); zram_meta_free(zram->meta); zram->meta = NULL; /* Reset stats */ @@ -539,6 +519,7 @@ static ssize_t disksize_store(struct device *dev, u64 disksize; struct zram_meta *meta; struct zram *zram = dev_to_zram(dev); + int err; disksize = memparse(buf, NULL); if (!disksize) @@ -551,10 +532,17 @@ static ssize_t disksize_store(struct device *dev, down_write(&zram->init_lock); if (init_done(zram)) { - zram_meta_free(meta); - up_write(&zram->init_lock); pr_info("Cannot change disksize for initialized device\n"); - return -EBUSY; + err = -EBUSY; + goto out_free_meta; + } + + zram->comp = zcomp_create(default_compressor); + if (!zram->comp) { + pr_info("Cannot initialise %s compressing backend\n", + default_compressor); + err = -EINVAL; + goto out_free_meta; } zram->meta = meta; @@ -563,6 +551,11 @@ static ssize_t disksize_store(struct device *dev, up_write(&zram->init_lock); return len; + +out_free_meta: + up_write(&zram->init_lock); + zram_meta_free(meta); + return err; } static ssize_t reset_store(struct device *dev, diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 1d5b1f5786a8..45e04f7b713f 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -16,9 +16,10 @@ 
#define _ZRAM_DRV_H_ #include -#include #include +#include "zcomp.h" + /* * Some arbitrary value. This is just to catch * invalid value for num_devices module parameter. */ @@ -81,17 +82,16 @@ struct zram_stats { struct zram_meta { rwlock_t tb_lock; /* protect table */ - void *compress_workmem; - void *compress_buffer; struct table *table; struct zs_pool *mem_pool; - struct mutex buffer_lock; /* protect compress buffers */ }; struct zram { struct zram_meta *meta; struct request_queue *queue; struct gendisk *disk; + struct zcomp *comp; + /* Prevent concurrent execution of device init, reset and R/W request */ struct rw_semaphore init_lock; /* -- cgit v1.2.3 From 9cc97529a180b369fcb7e5265771b6ba7e01f05b Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:13 -0700 Subject: zram: factor out single stream compression This is a preparation patch to add multi stream support to zcomp. Introduce struct zcomp_strm_single and a set of functions to manage zcomp_strm stream access. zcomp_strm_single implements a single compression stream, the same way as the current zcomp implementation. This moves zcomp_strm stream control and locking out of zcomp, so the compressing backend zcomp is not aware of the required locking. Single and multi streams require different locking schemes. Minchan Kim reported that the spinlock-based locking scheme (which is used in the multi stream implementation) has demonstrated a severe performance regression for the single compression stream case, compared to mutex-based locking; see https://lkml.org/lkml/2014/2/18/16 The following set of functions is added: - zcomp_strm_single_find()/zcomp_strm_single_release() find and release a compression stream, implement required locking - zcomp_strm_single_create()/zcomp_strm_single_destroy() create and destroy zcomp_strm_single New ->strm_find() and ->strm_release() callbacks are added to zcomp, which are set to zcomp_strm_single_find() and zcomp_strm_single_release() during initialisation. Instead of direct locking and zcomp_strm access from zcomp_strm_find() and zcomp_strm_release(), zcomp now calls ->strm_find() and ->strm_release() respectively.
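To see what the contract looks like from a policy's point of view, here is a deliberately trivial, hypothetical skeleton (names are made up; it is not part of the patch and performs no locking, which a real policy must add):

	struct zcomp_strm_dummy {
		struct zcomp_strm *zstrm;	/* one stream, no sharing */
	};

	static struct zcomp_strm *zcomp_strm_dummy_find(struct zcomp *comp)
	{
		struct zcomp_strm_dummy *zs = comp->stream;

		/* a real policy serialises access here (mutex, spinlock, ...) */
		return zs->zstrm;
	}

	static void zcomp_strm_dummy_release(struct zcomp *comp,
				struct zcomp_strm *zstrm)
	{
		/* a real policy unlocks or re-queues the stream here */
	}

	static void zcomp_strm_dummy_destroy(struct zcomp *comp)
	{
		struct zcomp_strm_dummy *zs = comp->stream;

		zcomp_strm_free(comp, zs->zstrm);
		kfree(zs);
	}

A corresponding create function would allocate the struct, set comp->stream and the ->strm_find()/->strm_release()/->destroy() callbacks, exactly as zcomp_strm_single_create() does in the patch.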
Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zcomp.c | 62 ++++++++++++++++++++++++++++++++++++++++------ drivers/block/zram/zcomp.h | 7 ++++-- 2 files changed, 59 insertions(+), 10 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 22f4ae235660..72e8071f9d73 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -16,6 +16,14 @@ #include "zcomp.h" #include "zcomp_lzo.h" +/* + * single zcomp_strm backend + */ +struct zcomp_strm_single { + struct mutex strm_lock; + struct zcomp_strm *zstrm; +}; + static struct zcomp_backend *find_backend(const char *compress) { if (strncmp(compress, "lzo", 3) == 0) @@ -54,15 +62,56 @@ static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp) return zstrm; } +static struct zcomp_strm *zcomp_strm_single_find(struct zcomp *comp) +{ + struct zcomp_strm_single *zs = comp->stream; + mutex_lock(&zs->strm_lock); + return zs->zstrm; +} + +static void zcomp_strm_single_release(struct zcomp *comp, + struct zcomp_strm *zstrm) +{ + struct zcomp_strm_single *zs = comp->stream; + mutex_unlock(&zs->strm_lock); +} + +static void zcomp_strm_single_destroy(struct zcomp *comp) +{ + struct zcomp_strm_single *zs = comp->stream; + zcomp_strm_free(comp, zs->zstrm); + kfree(zs); +} + +static int zcomp_strm_single_create(struct zcomp *comp) +{ + struct zcomp_strm_single *zs; + + comp->destroy = zcomp_strm_single_destroy; + comp->strm_find = zcomp_strm_single_find; + comp->strm_release = zcomp_strm_single_release; + zs = kmalloc(sizeof(struct zcomp_strm_single), GFP_KERNEL); + if (!zs) + return -ENOMEM; + + comp->stream = zs; + mutex_init(&zs->strm_lock); + zs->zstrm = zcomp_strm_alloc(comp); + if (!zs->zstrm) { + kfree(zs); + return -ENOMEM; + } + return 0; +} + struct zcomp_strm *zcomp_strm_find(struct zcomp *comp) { - mutex_lock(&comp->strm_lock); - return comp->zstrm; + return comp->strm_find(comp); } void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm) { - mutex_unlock(&comp->strm_lock); + comp->strm_release(comp, zstrm); } int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, @@ -80,7 +129,7 @@ int zcomp_decompress(struct zcomp *comp, const unsigned char *src, void zcomp_destroy(struct zcomp *comp) { - zcomp_strm_free(comp, comp->zstrm); + comp->destroy(comp); kfree(comp); } @@ -104,10 +153,7 @@ struct zcomp *zcomp_create(const char *compress) return NULL; comp->backend = backend; - mutex_init(&comp->strm_lock); - - comp->zstrm = zcomp_strm_alloc(comp); - if (!comp->zstrm) { + if (zcomp_strm_single_create(comp) != 0) { kfree(comp); return NULL; } diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index c9a98e1317fe..dc3500d842a3 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -39,9 +39,12 @@ struct zcomp_backend { /* dynamic per-device compression frontend */ struct zcomp { - struct mutex strm_lock; - struct zcomp_strm *zstrm; + void *stream; struct zcomp_backend *backend; + + struct zcomp_strm *(*strm_find)(struct zcomp *comp); + void (*strm_release)(struct zcomp *comp, struct zcomp_strm *zstrm); + void (*destroy)(struct zcomp *comp); }; struct zcomp *zcomp_create(const char *comp); -- cgit v1.2.3 From beca3ec71fe5490ee9237dc42400f50402baf83e Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:14 -0700 Subject: zram: add multi stream functionality Existing zram 
(zcomp) implementation has only one compression stream (buffer and algorithm private part), so in order to prevent data corruption only one write (compress operation) can use this compression stream, forcing all concurrent write operations to wait for stream lock to be released. This patch changes zcomp to keep a compression streams list of user-defined size (via sysfs device attr). Each write operation still exclusively holds compression stream, the difference is that we can have N write operations (depending on size of streams list) executing in parallel. See TEST section later in commit message for performance data. Introduce struct zcomp_strm_multi and a set of functions to manage zcomp_strm stream access. zcomp_strm_multi has a list of idle zcomp_strm structs, spinlock to protect idle list and wait queue, making it possible to perform parallel compressions. The following set of functions added: - zcomp_strm_multi_find()/zcomp_strm_multi_release() find and release a compression stream, implement required locking - zcomp_strm_multi_create()/zcomp_strm_multi_destroy() create and destroy zcomp_strm_multi zcomp ->strm_find() and ->strm_release() callbacks are set during initialisation to zcomp_strm_multi_find()/zcomp_strm_multi_release() correspondingly. Each time zcomp issues a zcomp_strm_multi_find() call, the following set of operations performed: - spin lock strm_lock - if idle list is not empty, remove zcomp_strm from idle list, spin unlock and return zcomp stream pointer to caller - if idle list is empty, current adds itself to wait queue. it will be awaken by zcomp_strm_multi_release() caller. zcomp_strm_multi_release(): - spin lock strm_lock - add zcomp stream to idle list - spin unlock, wake up sleeper Minchan Kim reported that spinlock-based locking scheme has demonstrated a severe perfomance regression for single compression stream case, comparing to mutex-based (see https://lkml.org/lkml/2014/2/18/16) base spinlock mutex ==Initial write ==Initial write ==Initial write records: 5 records: 5 records: 5 avg: 1642424.35 avg: 699610.40 avg: 1655583.71 std: 39890.95(2.43%) std: 232014.19(33.16%) std: 52293.96 max: 1690170.94 max: 1163473.45 max: 1697164.75 min: 1568669.52 min: 573429.88 min: 1553410.23 ==Rewrite ==Rewrite ==Rewrite records: 5 records: 5 records: 5 avg: 1611775.39 avg: 501406.64 avg: 1684419.11 std: 17144.58(1.06%) std: 15354.41(3.06%) std: 18367.42 max: 1641800.95 max: 531356.78 max: 1706445.84 min: 1593515.27 min: 488817.78 min: 1655335.73 When only one compression stream available, mutex with spin on owner tends to perform much better than frequent wait_event()/wake_up(). This is why single stream implemented as a special case with mutex locking. Introduce and document zram device attribute max_comp_streams. This attr shows and stores current zcomp's max number of zcomp streams (max_strm). Extend zcomp's zcomp_create() with `max_strm' parameter. `max_strm' limits the number of zcomp_strm structs in compression backend's idle list (max_comp_streams). max_comp_streams used during initialisation as follows: -- passing to zcomp_create() max_strm equals to 1 will initialise zcomp using single compression stream zcomp_strm_single (mutex-based locking). -- passing to zcomp_create() max_strm greater than 1 will initialise zcomp using multi compression stream zcomp_strm_multi (spinlock-based locking). default max_comp_streams value is 1, meaning that zram with single stream will be initialised. 
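Condensed to its core (an illustrative reduction of the patch, not complete code), the multi stream find/release handshake is:

	/* find: grab an idle stream or sleep for one (pool growth omitted) */
	spin_lock(&zs->strm_lock);
	if (!list_empty(&zs->idle_strm)) {
		zstrm = list_entry(zs->idle_strm.next, struct zcomp_strm, list);
		list_del(&zstrm->list);
		spin_unlock(&zs->strm_lock);
		return zstrm;
	}
	spin_unlock(&zs->strm_lock);
	wait_event(zs->strm_wait, !list_empty(&zs->idle_strm));

	/* release: return the stream and wake one waiter */
	spin_lock(&zs->strm_lock);
	list_add(&zstrm->list, &zs->idle_strm);
	spin_unlock(&zs->strm_lock);
	wake_up(&zs->strm_wait);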
Later patch will introduce configuration knob to change max_comp_streams on already initialised and used zcomp. TEST iozone -t 3 -R -r 16K -s 60M -I +Z test base 1 strm (mutex) 3 strm (spinlock) ----------------------------------------------------------------------- Initial write 589286.78 583518.39 718011.05 Rewrite 604837.97 596776.38 1515125.72 Random write 584120.11 595714.58 1388850.25 Pwrite 535731.17 541117.38 739295.27 Fwrite 1418083.88 1478612.72 1484927.06 Usage example: set max_comp_streams to 4 echo 4 > /sys/block/zram0/max_comp_streams show current max_comp_streams (default value is 1). cat /sys/block/zram0/max_comp_streams Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zcomp.c | 124 +++++++++++++++++++++++++++++++++++++++++- drivers/block/zram/zcomp.h | 4 +- drivers/block/zram/zram_drv.c | 42 +++++++++++++- drivers/block/zram/zram_drv.h | 2 +- 4 files changed, 167 insertions(+), 5 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 72e8071f9d73..c06f75f54718 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -24,6 +24,21 @@ struct zcomp_strm_single { struct zcomp_strm *zstrm; }; +/* + * multi zcomp_strm backend + */ +struct zcomp_strm_multi { + /* protect strm list */ + spinlock_t strm_lock; + /* max possible number of zstrm streams */ + int max_strm; + /* number of available zstrm streams */ + int avail_strm; + /* list of available strms */ + struct list_head idle_strm; + wait_queue_head_t strm_wait; +}; + static struct zcomp_backend *find_backend(const char *compress) { if (strncmp(compress, "lzo", 3) == 0) @@ -62,6 +77,107 @@ static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp) return zstrm; } +/* + * get idle zcomp_strm or wait until other process release + * (zcomp_strm_release()) one for us + */ +static struct zcomp_strm *zcomp_strm_multi_find(struct zcomp *comp) +{ + struct zcomp_strm_multi *zs = comp->stream; + struct zcomp_strm *zstrm; + + while (1) { + spin_lock(&zs->strm_lock); + if (!list_empty(&zs->idle_strm)) { + zstrm = list_entry(zs->idle_strm.next, + struct zcomp_strm, list); + list_del(&zstrm->list); + spin_unlock(&zs->strm_lock); + return zstrm; + } + /* zstrm streams limit reached, wait for idle stream */ + if (zs->avail_strm >= zs->max_strm) { + spin_unlock(&zs->strm_lock); + wait_event(zs->strm_wait, !list_empty(&zs->idle_strm)); + continue; + } + /* allocate new zstrm stream */ + zs->avail_strm++; + spin_unlock(&zs->strm_lock); + + zstrm = zcomp_strm_alloc(comp); + if (!zstrm) { + spin_lock(&zs->strm_lock); + zs->avail_strm--; + spin_unlock(&zs->strm_lock); + wait_event(zs->strm_wait, !list_empty(&zs->idle_strm)); + continue; + } + break; + } + return zstrm; +} + +/* add stream back to idle list and wake up waiter or free the stream */ +static void zcomp_strm_multi_release(struct zcomp *comp, struct zcomp_strm *zstrm) +{ + struct zcomp_strm_multi *zs = comp->stream; + + spin_lock(&zs->strm_lock); + if (zs->avail_strm <= zs->max_strm) { + list_add(&zstrm->list, &zs->idle_strm); + spin_unlock(&zs->strm_lock); + wake_up(&zs->strm_wait); + return; + } + + zs->avail_strm--; + spin_unlock(&zs->strm_lock); + zcomp_strm_free(comp, zstrm); +} + +static void zcomp_strm_multi_destroy(struct zcomp *comp) +{ + struct zcomp_strm_multi *zs = comp->stream; + struct zcomp_strm *zstrm; + + while (!list_empty(&zs->idle_strm)) { + zstrm = 
list_entry(zs->idle_strm.next, + struct zcomp_strm, list); + list_del(&zstrm->list); + zcomp_strm_free(comp, zstrm); + } + kfree(zs); +} + +static int zcomp_strm_multi_create(struct zcomp *comp, int max_strm) +{ + struct zcomp_strm *zstrm; + struct zcomp_strm_multi *zs; + + comp->destroy = zcomp_strm_multi_destroy; + comp->strm_find = zcomp_strm_multi_find; + comp->strm_release = zcomp_strm_multi_release; + zs = kmalloc(sizeof(struct zcomp_strm_multi), GFP_KERNEL); + if (!zs) + return -ENOMEM; + + comp->stream = zs; + spin_lock_init(&zs->strm_lock); + INIT_LIST_HEAD(&zs->idle_strm); + init_waitqueue_head(&zs->strm_wait); + zs->max_strm = max_strm; + zs->avail_strm = 1; + + zstrm = zcomp_strm_alloc(comp); + if (!zstrm) { + kfree(zs); + return -ENOMEM; + } + list_add(&zstrm->list, &zs->idle_strm); + return 0; +} + static struct zcomp_strm *zcomp_strm_single_find(struct zcomp *comp) { struct zcomp_strm_single *zs = comp->stream; @@ -139,7 +255,7 @@ void zcomp_destroy(struct zcomp *comp) * if requested algorithm is not supported or in case * of init error */ -struct zcomp *zcomp_create(const char *compress) +struct zcomp *zcomp_create(const char *compress, int max_strm) { struct zcomp *comp; struct zcomp_backend *backend; @@ -153,7 +269,11 @@ struct zcomp *zcomp_create(const char *compress) return NULL; comp->backend = backend; - if (zcomp_strm_single_create(comp) != 0) { + if (max_strm > 1) + zcomp_strm_multi_create(comp, max_strm); + else + zcomp_strm_single_create(comp); + if (!comp->stream) { kfree(comp); return NULL; } diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index dc3500d842a3..2a3684446160 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -21,6 +21,8 @@ struct zcomp_strm { * working memory) */ void *private; + /* used in multi stream backend, protected by backend strm_lock */ + struct list_head list; }; /* static compression backend */ @@ -47,7 +49,7 @@ struct zcomp { void (*destroy)(struct zcomp *comp); }; -struct zcomp *zcomp_create(const char *comp); +struct zcomp *zcomp_create(const char *comp, int max_strm); void zcomp_destroy(struct zcomp *comp); struct zcomp_strm *zcomp_strm_find(struct zcomp *comp); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 98823f9ca8b1..bdc7eb8c6df7 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -108,6 +108,40 @@ static ssize_t mem_used_total_show(struct device *dev, return sprintf(buf, "%llu\n", val); } +static ssize_t max_comp_streams_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int val; + struct zram *zram = dev_to_zram(dev); + + down_read(&zram->init_lock); + val = zram->max_comp_streams; + up_read(&zram->init_lock); + + return sprintf(buf, "%d\n", val); +} + +static ssize_t max_comp_streams_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + int num; + struct zram *zram = dev_to_zram(dev); + + if (kstrtoint(buf, 0, &num)) + return -EINVAL; + if (num < 1) + return -EINVAL; + down_write(&zram->init_lock); + if (init_done(zram)) { + up_write(&zram->init_lock); + pr_info("Can't set max_comp_streams for initialized device\n"); + return -EBUSY; + } + zram->max_comp_streams = num; + up_write(&zram->init_lock); + return len; +} + /* flag operations needs meta->tb_lock */ static int zram_test_flag(struct zram_meta *meta, u32 index, enum zram_pageflags flag) @@ -502,6 +536,8 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) } 
zcomp_destroy(zram->comp); + zram->max_comp_streams = 1; + zram_meta_free(zram->meta); zram->meta = NULL; /* Reset stats */ @@ -537,7 +573,7 @@ static ssize_t disksize_store(struct device *dev, goto out_free_meta; } - zram->comp = zcomp_create(default_compressor); + zram->comp = zcomp_create(default_compressor, zram->max_comp_streams); if (!zram->comp) { pr_info("Cannot initialise %s compressing backend\n", default_compressor); @@ -698,6 +734,8 @@ static DEVICE_ATTR(initstate, S_IRUGO, initstate_show, NULL); static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store); static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL); static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); +static DEVICE_ATTR(max_comp_streams, S_IRUGO | S_IWUSR, + max_comp_streams_show, max_comp_streams_store); ZRAM_ATTR_RO(num_reads); ZRAM_ATTR_RO(num_writes); @@ -722,6 +760,7 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_orig_data_size.attr, &dev_attr_compr_data_size.attr, &dev_attr_mem_used_total.attr, + &dev_attr_max_comp_streams.attr, NULL, }; @@ -784,6 +823,7 @@ static int create_device(struct zram *zram, int device_id) } zram->meta = NULL; + zram->max_comp_streams = 1; return 0; out_free_disk: diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 45e04f7b713f..ccf36d11755a 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -99,7 +99,7 @@ struct zram { * we can store in a disk. */ u64 disksize; /* bytes */ - + int max_comp_streams; struct zram_stats stats; }; #endif -- cgit v1.2.3 From fe8eb122c82b2049c460fc6df6e8583a2f935cff Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:15 -0700 Subject: zram: add set_max_streams knob This patch allows to change max_comp_streams on initialised zcomp. Introduce zcomp set_max_streams() knob, zcomp_strm_multi_set_max_streams() and zcomp_strm_single_set_max_streams() callbacks to change streams limit for zcomp_strm_multi and zcomp_strm_single, accordingly. set_max_streams for single steam zcomp does nothing. If user has lowered the limit, then zcomp_strm_multi_set_max_streams() attempts to immediately free extra streams (as much as it can, depending on idle streams availability). Note, this patch does not allow to change stream 'policy' from single to multi stream (or vice versa) on already initialised compression backend. Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zcomp.c | 36 ++++++++++++++++++++++++++++++++++++ drivers/block/zram/zcomp.h | 3 +++ drivers/block/zram/zram_drv.c | 5 ++--- 3 files changed, 41 insertions(+), 3 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index c06f75f54718..ac276f79f21c 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -136,6 +136,29 @@ static void zcomp_strm_multi_release(struct zcomp *comp, struct zcomp_strm *zstr zcomp_strm_free(comp, zstrm); } +/* change max_strm limit */ +static int zcomp_strm_multi_set_max_streams(struct zcomp *comp, int num_strm) +{ + struct zcomp_strm_multi *zs = comp->stream; + struct zcomp_strm *zstrm; + + spin_lock(&zs->strm_lock); + zs->max_strm = num_strm; + /* + * if user has lowered the limit and there are idle streams, + * immediately free as much streams (and memory) as we can. 
+ */ + while (zs->avail_strm > num_strm && !list_empty(&zs->idle_strm)) { + zstrm = list_entry(zs->idle_strm.next, + struct zcomp_strm, list); + list_del(&zstrm->list); + zcomp_strm_free(comp, zstrm); + zs->avail_strm--; + } + spin_unlock(&zs->strm_lock); + return 0; +} + static void zcomp_strm_multi_destroy(struct zcomp *comp) { struct zcomp_strm_multi *zs = comp->stream; @@ -158,6 +181,7 @@ static int zcomp_strm_multi_create(struct zcomp *comp, int max_strm) comp->destroy = zcomp_strm_multi_destroy; comp->strm_find = zcomp_strm_multi_find; comp->strm_release = zcomp_strm_multi_release; + comp->set_max_streams = zcomp_strm_multi_set_max_streams; zs = kmalloc(sizeof(struct zcomp_strm_multi), GFP_KERNEL); if (!zs) return -ENOMEM; @@ -192,6 +216,12 @@ static void zcomp_strm_single_release(struct zcomp *comp, mutex_unlock(&zs->strm_lock); } +static int zcomp_strm_single_set_max_streams(struct zcomp *comp, int num_strm) +{ + /* zcomp_strm_single support only max_comp_streams == 1 */ + return -ENOTSUPP; +} + static void zcomp_strm_single_destroy(struct zcomp *comp) { struct zcomp_strm_single *zs = comp->stream; @@ -206,6 +236,7 @@ static int zcomp_strm_single_create(struct zcomp *comp) comp->destroy = zcomp_strm_single_destroy; comp->strm_find = zcomp_strm_single_find; comp->strm_release = zcomp_strm_single_release; + comp->set_max_streams = zcomp_strm_single_set_max_streams; zs = kmalloc(sizeof(struct zcomp_strm_single), GFP_KERNEL); if (!zs) return -ENOMEM; @@ -220,6 +251,11 @@ static int zcomp_strm_single_create(struct zcomp *comp) return 0; } +int zcomp_set_max_streams(struct zcomp *comp, int num_strm) +{ + return comp->set_max_streams(comp, num_strm); +} + struct zcomp_strm *zcomp_strm_find(struct zcomp *comp) { return comp->strm_find(comp); diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index 2a3684446160..bd11d59c5dd1 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -46,6 +46,7 @@ struct zcomp { struct zcomp_strm *(*strm_find)(struct zcomp *comp); void (*strm_release)(struct zcomp *comp, struct zcomp_strm *zstrm); + int (*set_max_streams)(struct zcomp *comp, int num_strm); void (*destroy)(struct zcomp *comp); }; @@ -60,4 +61,6 @@ int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, int zcomp_decompress(struct zcomp *comp, const unsigned char *src, size_t src_len, unsigned char *dst); + +int zcomp_set_max_streams(struct zcomp *comp, int num_strm); #endif /* _ZCOMP_H_ */ diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index bdc7eb8c6df7..3a5f24c341dc 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -133,9 +133,8 @@ static ssize_t max_comp_streams_store(struct device *dev, return -EINVAL; down_write(&zram->init_lock); if (init_done(zram)) { - up_write(&zram->init_lock); - pr_info("Can't set max_comp_streams for initialized device\n"); - return -EBUSY; + if (zcomp_set_max_streams(zram->comp, num)) + pr_info("Cannot change max compression streams\n"); } zram->max_comp_streams = num; up_write(&zram->init_lock); -- cgit v1.2.3 From e46b8a030d76d3c94156c545c3f4c3676d813435 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:17 -0700 Subject: zram: make compression algorithm selection possible Add and document `comp_algorithm' device attribute. 
This attribute allows to show supported compression and currently selected compression algorithms: cat /sys/block/zram0/comp_algorithm [lzo] lz4 and change selected compression algorithm: echo lzo > /sys/block/zram0/comp_algorithm Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zcomp.c | 32 +++++++++++++++++++++++++++++--- drivers/block/zram/zcomp.h | 2 ++ drivers/block/zram/zram_drv.c | 37 ++++++++++++++++++++++++++++++++++--- drivers/block/zram/zram_drv.h | 1 + 4 files changed, 66 insertions(+), 6 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index ac276f79f21c..aad533a8bc55 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -39,11 +39,20 @@ struct zcomp_strm_multi { wait_queue_head_t strm_wait; }; +static struct zcomp_backend *backends[] = { + &zcomp_lzo, + NULL +}; + static struct zcomp_backend *find_backend(const char *compress) { - if (strncmp(compress, "lzo", 3) == 0) - return &zcomp_lzo; - return NULL; + int i = 0; + while (backends[i]) { + if (sysfs_streq(compress, backends[i]->name)) + break; + i++; + } + return backends[i]; } static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *zstrm) @@ -251,6 +260,23 @@ static int zcomp_strm_single_create(struct zcomp *comp) return 0; } +/* show available compressors */ +ssize_t zcomp_available_show(const char *comp, char *buf) +{ + ssize_t sz = 0; + int i = 0; + + while (backends[i]) { + if (sysfs_streq(comp, backends[i]->name)) + sz += sprintf(buf + sz, "[%s] ", backends[i]->name); + else + sz += sprintf(buf + sz, "%s ", backends[i]->name); + i++; + } + sz += sprintf(buf + sz, "\n"); + return sz; +} + int zcomp_set_max_streams(struct zcomp *comp, int num_strm) { return comp->set_max_streams(comp, num_strm); diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index bd11d59c5dd1..8b8997f8613b 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -50,6 +50,8 @@ struct zcomp { void (*destroy)(struct zcomp *comp); }; +ssize_t zcomp_available_show(const char *comp, char *buf); + struct zcomp *zcomp_create(const char *comp, int max_strm); void zcomp_destroy(struct zcomp *comp); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 3a5f24c341dc..15d46f2e158c 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -141,6 +141,34 @@ static ssize_t max_comp_streams_store(struct device *dev, return len; } +static ssize_t comp_algorithm_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + size_t sz; + struct zram *zram = dev_to_zram(dev); + + down_read(&zram->init_lock); + sz = zcomp_available_show(zram->compressor, buf); + up_read(&zram->init_lock); + + return sz; +} + +static ssize_t comp_algorithm_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct zram *zram = dev_to_zram(dev); + down_write(&zram->init_lock); + if (init_done(zram)) { + up_write(&zram->init_lock); + pr_info("Can't change algorithm for initialized device\n"); + return -EBUSY; + } + strlcpy(zram->compressor, buf, sizeof(zram->compressor)); + up_write(&zram->init_lock); + return len; +} + /* flag operations needs meta->tb_lock */ static int zram_test_flag(struct zram_meta *meta, u32 index, enum zram_pageflags flag) @@ -572,10 +600,10 @@ static ssize_t disksize_store(struct device *dev, goto 
out_free_meta; } - zram->comp = zcomp_create(default_compressor, zram->max_comp_streams); + zram->comp = zcomp_create(zram->compressor, zram->max_comp_streams); if (!zram->comp) { pr_info("Cannot initialise %s compressing backend\n", - default_compressor); + zram->compressor); err = -EINVAL; goto out_free_meta; } @@ -735,6 +763,8 @@ static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL); static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); static DEVICE_ATTR(max_comp_streams, S_IRUGO | S_IWUSR, max_comp_streams_show, max_comp_streams_store); +static DEVICE_ATTR(comp_algorithm, S_IRUGO | S_IWUSR, + comp_algorithm_show, comp_algorithm_store); ZRAM_ATTR_RO(num_reads); ZRAM_ATTR_RO(num_writes); @@ -760,6 +790,7 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_compr_data_size.attr, &dev_attr_mem_used_total.attr, &dev_attr_max_comp_streams.attr, + &dev_attr_comp_algorithm.attr, NULL, }; @@ -820,7 +851,7 @@ static int create_device(struct zram *zram, int device_id) pr_warn("Error creating sysfs group"); goto out_free_disk; } - + strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor)); zram->meta = NULL; zram->max_comp_streams = 1; return 0; diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index ccf36d11755a..7f21c145e317 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -101,5 +101,6 @@ struct zram { u64 disksize; /* bytes */ int max_comp_streams; struct zram_stats stats; + char compressor[10]; }; #endif -- cgit v1.2.3 From 6e76668e415adf799839f0ab205142ad7002d260 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:18 -0700 Subject: zram: add lz4 algorithm backend Introduce LZ4 compression backend and make it available for selection. LZ4 support is optional and requires user to set ZRAM_LZ4_COMPRESS config option. The default compression backend is LZO. TEST (x86_64, core i5, 2 cores + 2 hyperthreading, zram disk size 1G, ext4 file system, 3 compression streams) iozone -t 3 -R -r 16K -s 60M -I +Z Test LZO LZ4 ---------------------------------------------- Initial write 1642744.62 1317005.09 Rewrite 2498980.88 1800645.16 Read 3957026.38 5877043.75 Re-read 3950997.38 5861847.00 Reverse Read 2937114.56 5047384.00 Stride read 2948163.19 4929587.38 Random read 3292692.69 4880793.62 Mixed workload 1545602.62 3502940.38 Random write 2448039.75 1758786.25 Pwrite 1670051.03 1338329.69 Pread 2530682.00 5097177.62 Fwrite 3232085.62 3275942.56 Fread 6306880.25 6645271.12 So on my system LZ4 is slower in write-only tests, while it performs better in read-only and mixed (reads + writes) tests. Official LZ4 benchmarks available here http://code.google.com/p/lz4/ (linux kernel uses revision r90). 
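Usage sketch (assuming CONFIG_ZRAM_LZ4_COMPRESS=y and a not-yet-initialised device): the new backend shows up next to lzo and can be selected through the comp_algorithm attribute introduced earlier, e.g.

	cat /sys/block/zram0/comp_algorithm
	[lzo] lz4
	echo lz4 > /sys/block/zram0/comp_algorithm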
Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/Kconfig | 10 +++++++++ drivers/block/zram/Makefile | 2 ++ drivers/block/zram/zcomp.c | 6 ++++++ drivers/block/zram/zcomp_lz4.c | 47 ++++++++++++++++++++++++++++++++++++++++++ drivers/block/zram/zcomp_lz4.h | 17 +++++++++++++++ 5 files changed, 82 insertions(+) create mode 100644 drivers/block/zram/zcomp_lz4.c create mode 100644 drivers/block/zram/zcomp_lz4.h (limited to 'drivers/block') diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index 3450be850399..6489c0fd0ea6 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -15,6 +15,16 @@ config ZRAM See zram.txt for more information. +config ZRAM_LZ4_COMPRESS + bool "Enable LZ4 algorithm support" + depends on ZRAM + select LZ4_COMPRESS + select LZ4_DECOMPRESS + default n + help + This option enables LZ4 compression algorithm support. Compression + algorithm can be changed using `comp_algorithm' device attribute. + config ZRAM_DEBUG bool "Compressed RAM block device debug support" depends on ZRAM diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile index 757c6a5cadff..be0763ff57a2 100644 --- a/drivers/block/zram/Makefile +++ b/drivers/block/zram/Makefile @@ -1,3 +1,5 @@ zram-y := zcomp_lzo.o zcomp.o zram_drv.o +zram-$(CONFIG_ZRAM_LZ4_COMPRESS) += zcomp_lz4.o + obj-$(CONFIG_ZRAM) += zram.o diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index aad533a8bc55..d5919031ca8b 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -15,6 +15,9 @@ #include "zcomp.h" #include "zcomp_lzo.h" +#ifdef CONFIG_ZRAM_LZ4_COMPRESS +#include "zcomp_lz4.h" +#endif /* * single zcomp_strm backend @@ -41,6 +44,9 @@ struct zcomp_strm_multi { static struct zcomp_backend *backends[] = { &zcomp_lzo, +#ifdef CONFIG_ZRAM_LZ4_COMPRESS + &zcomp_lz4, +#endif NULL }; diff --git a/drivers/block/zram/zcomp_lz4.c b/drivers/block/zram/zcomp_lz4.c new file mode 100644 index 000000000000..f2afb7e988c3 --- /dev/null +++ b/drivers/block/zram/zcomp_lz4.c @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include + +#include "zcomp_lz4.h" + +static void *zcomp_lz4_create(void) +{ + return kzalloc(LZ4_MEM_COMPRESS, GFP_KERNEL); +} + +static void zcomp_lz4_destroy(void *private) +{ + kfree(private); +} + +static int zcomp_lz4_compress(const unsigned char *src, unsigned char *dst, + size_t *dst_len, void *private) +{ + /* return : Success if return 0 */ + return lz4_compress(src, PAGE_SIZE, dst, dst_len, private); +} + +static int zcomp_lz4_decompress(const unsigned char *src, size_t src_len, + unsigned char *dst) +{ + size_t dst_len = PAGE_SIZE; + /* return : Success if return 0 */ + return lz4_decompress_unknownoutputsize(src, src_len, dst, &dst_len); +} + +struct zcomp_backend zcomp_lz4 = { + .compress = zcomp_lz4_compress, + .decompress = zcomp_lz4_decompress, + .create = zcomp_lz4_create, + .destroy = zcomp_lz4_destroy, + .name = "lz4", +}; diff --git a/drivers/block/zram/zcomp_lz4.h b/drivers/block/zram/zcomp_lz4.h new file mode 100644 index 000000000000..60613fb29dd8 --- /dev/null +++ b/drivers/block/zram/zcomp_lz4.h @@ -0,0 +1,17 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _ZCOMP_LZ4_H_ +#define _ZCOMP_LZ4_H_ + +#include "zcomp.h" + +extern struct zcomp_backend zcomp_lz4; + +#endif /* _ZCOMP_LZ4_H_ */ -- cgit v1.2.3 From d61f98c70e8b0d324e8e83be2ed546d6295e63f3 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:19 -0700 Subject: zram: move comp allocation out of init_lock While fixing lockdep spew of ->init_lock reported by Sasha Levin [1], Minchan Kim noted [2] that it's better to move compression backend allocation (using GPF_KERNEL) out of the ->init_lock lock, same way as with zram_meta_alloc(), in order to prevent the same lockdep spew. 
[1] https://lkml.org/lkml/2014/2/27/337 [2] https://lkml.org/lkml/2014/3/3/32 Signed-off-by: Sergey Senozhatsky Reported-by: Minchan Kim Acked-by: Minchan Kim Cc: Sasha Levin Acked-by: Jerome Marchand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 15d46f2e158c..e4d536b485e6 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -580,9 +580,10 @@ static ssize_t disksize_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { u64 disksize; + struct zcomp *comp; struct zram_meta *meta; struct zram *zram = dev_to_zram(dev); - int err; + int err = -EINVAL; disksize = memparse(buf, NULL); if (!disksize) @@ -593,30 +594,32 @@ static ssize_t disksize_store(struct device *dev, if (!meta) return -ENOMEM; + comp = zcomp_create(zram->compressor, zram->max_comp_streams); + if (!comp) { + pr_info("Cannot initialise %s compressing backend\n", + zram->compressor); + goto out_cleanup; + } + down_write(&zram->init_lock); if (init_done(zram)) { + up_write(&zram->init_lock); pr_info("Cannot change disksize for initialized device\n"); err = -EBUSY; - goto out_free_meta; - } - - zram->comp = zcomp_create(zram->compressor, zram->max_comp_streams); - if (!zram->comp) { - pr_info("Cannot initialise %s compressing backend\n", - zram->compressor); - err = -EINVAL; - goto out_free_meta; + goto out_cleanup; } zram->meta = meta; + zram->comp = comp; zram->disksize = disksize; set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); up_write(&zram->init_lock); return len; -out_free_meta: - up_write(&zram->init_lock); +out_cleanup: + if (comp) + zcomp_destroy(comp); zram_meta_free(meta); return err; } -- cgit v1.2.3 From fcfa8d95cacf5cbbe6dee6b8d229fe86142266e0 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:20 -0700 Subject: zram: return error-valued pointer from zcomp_create() Instead of returning just NULL, return ERR_PTR from zcomp_create() if compressing backend creation has failed. ERR_PTR(-EINVAL) for unsupported compression algorithm request, ERR_PTR(-ENOMEM) for allocation (zcomp or compression stream) error. Perform IS_ERR() check of returned from zcomp_create() value in disksize_store() and set return code to PTR_ERR(). Change suggested by Jerome Marchand. [akpm@linux-foundation.org: clean up error recovery flow] Signed-off-by: Sergey Senozhatsky Reported-by: Jerome Marchand Cc: Minchan Kim Cc: Nitin Gupta Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zcomp.c | 14 ++++++++------ drivers/block/zram/zram_drv.c | 19 ++++++++++--------- 2 files changed, 18 insertions(+), 15 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index d5919031ca8b..5647d8fe1dc1 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -319,9 +320,10 @@ void zcomp_destroy(struct zcomp *comp) /* * search available compressors for requested algorithm. - * allocate new zcomp and initialize it. return NULL - * if requested algorithm is not supported or in case - * of init error + * allocate new zcomp and initialize it. return compressing + * backend pointer or ERR_PTR if things went bad. 
ERR_PTR(-EINVAL) + * if requested algorithm is not supported, ERR_PTR(-ENOMEM) in + * case of allocation error. */ struct zcomp *zcomp_create(const char *compress, int max_strm) { @@ -330,11 +332,11 @@ struct zcomp *zcomp_create(const char *compress, int max_strm) backend = find_backend(compress); if (!backend) - return NULL; + return ERR_PTR(-EINVAL); comp = kzalloc(sizeof(struct zcomp), GFP_KERNEL); if (!comp) - return NULL; + return ERR_PTR(-ENOMEM); comp->backend = backend; if (max_strm > 1) @@ -343,7 +345,7 @@ struct zcomp *zcomp_create(const char *compress, int max_strm) zcomp_strm_single_create(comp); if (!comp->stream) { kfree(comp); - return NULL; + return ERR_PTR(-ENOMEM); } return comp; } diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index e4d536b485e6..6b462d27e7d7 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "zram_drv.h" @@ -583,7 +584,7 @@ static ssize_t disksize_store(struct device *dev, struct zcomp *comp; struct zram_meta *meta; struct zram *zram = dev_to_zram(dev); - int err = -EINVAL; + int err; disksize = memparse(buf, NULL); if (!disksize) @@ -595,18 +596,18 @@ static ssize_t disksize_store(struct device *dev, return -ENOMEM; comp = zcomp_create(zram->compressor, zram->max_comp_streams); - if (!comp) { + if (IS_ERR(comp)) { pr_info("Cannot initialise %s compressing backend\n", zram->compressor); - goto out_cleanup; + err = PTR_ERR(comp); + goto out_free_meta; } down_write(&zram->init_lock); if (init_done(zram)) { - up_write(&zram->init_lock); pr_info("Cannot change disksize for initialized device\n"); err = -EBUSY; - goto out_cleanup; + goto out_destroy_comp; } zram->meta = meta; @@ -614,12 +615,12 @@ static ssize_t disksize_store(struct device *dev, zram->disksize = disksize; set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); up_write(&zram->init_lock); - return len; -out_cleanup: - if (comp) - zcomp_destroy(comp); +out_destroy_comp: + up_write(&zram->init_lock); + zcomp_destroy(comp); +out_free_meta: zram_meta_free(meta); return err; } -- cgit v1.2.3 From 60a726e33375a1096e85399cfa1327081b4c38be Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 7 Apr 2014 15:38:21 -0700 Subject: zram: propagate error to user When we initialized zcomp with single, we couldn't change max_comp_streams without zram reset but current interface doesn't show any error to user and even it changes max_comp_streams's value without any effect so it would make user very confusing. This patch prevents max_comp_streams's change when zcomp was initialized as single zcomp and emit the error to user(ex, echo). 
[akpm@linux-foundation.org: don't return with the lock held, per Sergey] [fengguang.wu@intel.com: fix coccinelle warnings] Signed-off-by: Minchan Kim Cc: Nitin Gupta Cc: Jerome Marchand Acked-by: Sergey Senozhatsky Signed-off-by: Fengguang Wu Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zcomp.c | 10 +++++----- drivers/block/zram/zcomp.h | 4 ++-- drivers/block/zram/zram_drv.c | 17 +++++++++++++---- 3 files changed, 20 insertions(+), 11 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 5647d8fe1dc1..b0e7592c44d8 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -153,7 +153,7 @@ static void zcomp_strm_multi_release(struct zcomp *comp, struct zcomp_strm *zstr } /* change max_strm limit */ -static int zcomp_strm_multi_set_max_streams(struct zcomp *comp, int num_strm) +static bool zcomp_strm_multi_set_max_streams(struct zcomp *comp, int num_strm) { struct zcomp_strm_multi *zs = comp->stream; struct zcomp_strm *zstrm; @@ -172,7 +172,7 @@ static int zcomp_strm_multi_set_max_streams(struct zcomp *comp, int num_strm) zs->avail_strm--; } spin_unlock(&zs->strm_lock); - return 0; + return true; } static void zcomp_strm_multi_destroy(struct zcomp *comp) @@ -232,10 +232,10 @@ static void zcomp_strm_single_release(struct zcomp *comp, mutex_unlock(&zs->strm_lock); } -static int zcomp_strm_single_set_max_streams(struct zcomp *comp, int num_strm) +static bool zcomp_strm_single_set_max_streams(struct zcomp *comp, int num_strm) { /* zcomp_strm_single support only max_comp_streams == 1 */ - return -ENOTSUPP; + return false; } static void zcomp_strm_single_destroy(struct zcomp *comp) @@ -284,7 +284,7 @@ ssize_t zcomp_available_show(const char *comp, char *buf) return sz; } -int zcomp_set_max_streams(struct zcomp *comp, int num_strm) +bool zcomp_set_max_streams(struct zcomp *comp, int num_strm) { return comp->set_max_streams(comp, num_strm); } diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index 8b8997f8613b..c59d1fca72c0 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -46,7 +46,7 @@ struct zcomp { struct zcomp_strm *(*strm_find)(struct zcomp *comp); void (*strm_release)(struct zcomp *comp, struct zcomp_strm *zstrm); - int (*set_max_streams)(struct zcomp *comp, int num_strm); + bool (*set_max_streams)(struct zcomp *comp, int num_strm); void (*destroy)(struct zcomp *comp); }; @@ -64,5 +64,5 @@ int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, int zcomp_decompress(struct zcomp *comp, const unsigned char *src, size_t src_len, unsigned char *dst); -int zcomp_set_max_streams(struct zcomp *comp, int num_strm); +bool zcomp_set_max_streams(struct zcomp *comp, int num_strm); #endif /* _ZCOMP_H_ */ diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 6b462d27e7d7..80a1cfca1bf0 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -127,19 +127,28 @@ static ssize_t max_comp_streams_store(struct device *dev, { int num; struct zram *zram = dev_to_zram(dev); + int ret; - if (kstrtoint(buf, 0, &num)) - return -EINVAL; + ret = kstrtoint(buf, 0, &num); + if (ret < 0) + return ret; if (num < 1) return -EINVAL; + down_write(&zram->init_lock); if (init_done(zram)) { - if (zcomp_set_max_streams(zram->comp, num)) + if (!zcomp_set_max_streams(zram->comp, num)) { pr_info("Cannot change max compression streams\n"); + ret = -EINVAL; + goto out; + } } + 
zram->max_comp_streams = num; + ret = len; +out: up_write(&zram->init_lock); - return len; + return ret; } static ssize_t comp_algorithm_show(struct device *dev, -- cgit v1.2.3 From 56b4e8cb85827a2ccc4752a2a7148e56b62b7e96 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:22 -0700 Subject: zram: use scnprintf() in attrs show() methods sysfs.txt documentation lists the following requirements: - The buffer will always be PAGE_SIZE bytes in length. On i386, this is 4096. - show() methods should return the number of bytes printed into the buffer. This is the return value of scnprintf(). - show() should always use scnprintf(). Use scnprintf() in show() functions. Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zcomp.c | 8 +++++--- drivers/block/zram/zram_drv.c | 12 ++++++------ 2 files changed, 11 insertions(+), 9 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index b0e7592c44d8..f1ff39a3d1c1 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -275,12 +275,14 @@ ssize_t zcomp_available_show(const char *comp, char *buf) while (backends[i]) { if (sysfs_streq(comp, backends[i]->name)) - sz += sprintf(buf + sz, "[%s] ", backends[i]->name); + sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, + "[%s] ", backends[i]->name); else - sz += sprintf(buf + sz, "%s ", backends[i]->name); + sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, + "%s ", backends[i]->name); i++; } - sz += sprintf(buf + sz, "\n"); + sz += scnprintf(buf + sz, PAGE_SIZE - sz, "\n"); return sz; } diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 80a1cfca1bf0..15e7b8e64cbb 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -48,7 +48,7 @@ static ssize_t zram_attr_##name##_show(struct device *d, \ struct device_attribute *attr, char *b) \ { \ struct zram *zram = dev_to_zram(d); \ - return sprintf(b, "%llu\n", \ + return scnprintf(b, PAGE_SIZE, "%llu\n", \ (u64)atomic64_read(&zram->stats.name)); \ } \ static struct device_attribute dev_attr_##name = \ @@ -69,7 +69,7 @@ static ssize_t disksize_show(struct device *dev, { struct zram *zram = dev_to_zram(dev); - return sprintf(buf, "%llu\n", zram->disksize); + return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize); } static ssize_t initstate_show(struct device *dev, @@ -82,7 +82,7 @@ static ssize_t initstate_show(struct device *dev, val = init_done(zram); up_read(&zram->init_lock); - return sprintf(buf, "%u\n", val); + return scnprintf(buf, PAGE_SIZE, "%u\n", val); } static ssize_t orig_data_size_show(struct device *dev, @@ -90,7 +90,7 @@ static ssize_t orig_data_size_show(struct device *dev, { struct zram *zram = dev_to_zram(dev); - return sprintf(buf, "%llu\n", + return scnprintf(buf, PAGE_SIZE, "%llu\n", (u64)(atomic64_read(&zram->stats.pages_stored)) << PAGE_SHIFT); } @@ -106,7 +106,7 @@ static ssize_t mem_used_total_show(struct device *dev, val = zs_get_total_size_bytes(meta->mem_pool); up_read(&zram->init_lock); - return sprintf(buf, "%llu\n", val); + return scnprintf(buf, PAGE_SIZE, "%llu\n", val); } static ssize_t max_comp_streams_show(struct device *dev, @@ -119,7 +119,7 @@ static ssize_t max_comp_streams_show(struct device *dev, val = zram->max_comp_streams; up_read(&zram->init_lock); - return sprintf(buf, "%d\n", val); + return scnprintf(buf, PAGE_SIZE, "%d\n", val); } static ssize_t 
max_comp_streams_store(struct device *dev, -- cgit v1.2.3 From f4659d8e620d08bd1a84a8aec5d2f5294a242764 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Mon, 7 Apr 2014 15:38:24 -0700 Subject: zram: support REQ_DISCARD zram is ram based block device and can be used by backend of filesystem. When filesystem deletes a file, it normally doesn't do anything on data block of that file. It just marks on metadata of that file. This behavior has no problem on disk based block device, but has problems on ram based block device, since we can't free memory used for data block. To overcome this disadvantage, there is REQ_DISCARD functionality. If block device support REQ_DISCARD and filesystem is mounted with discard option, filesystem sends REQ_DISCARD to block device whenever some data blocks are discarded. All we have to do is to handle this request. This patch implements to flag up QUEUE_FLAG_DISCARD and handle this REQ_DISCARD request. With it, we can free memory used by zram if it isn't used. [akpm@linux-foundation.org: tweak comments] Signed-off-by: Joonsoo Kim Cc: Minchan Kim Cc: Nitin Gupta Cc: Sergey Senozhatsky Cc: Jerome Marchand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 62 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 15e7b8e64cbb..9849b5233bf4 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -551,6 +551,47 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, return ret; } +/* + * zram_bio_discard - handler on discard request + * @index: physical block index in PAGE_SIZE units + * @offset: byte offset within physical block + */ +static void zram_bio_discard(struct zram *zram, u32 index, + int offset, struct bio *bio) +{ + size_t n = bio->bi_iter.bi_size; + + /* + * zram manages data in physical block size units. Because logical block + * size isn't identical with physical block size on some arch, we + * could get a discard request pointing to a specific offset within a + * certain physical block. Although we can handle this request by + * reading that physiclal block and decompressing and partially zeroing + * and re-compressing and then re-storing it, this isn't reasonable + * because our intent with a discard request is to save memory. So + * skipping this logical block is appropriate here. + */ + if (offset) { + if (n < offset) + return; + + n -= offset; + index++; + } + + while (n >= PAGE_SIZE) { + /* + * Discard request can be large so the lock hold times could be + * lengthy. So take the lock once per page. 
+ */ + write_lock(&zram->meta->tb_lock); + zram_free_page(zram, index); + write_unlock(&zram->meta->tb_lock); + index++; + n -= PAGE_SIZE; + } +} + static void zram_reset_device(struct zram *zram, bool reset_capacity) { size_t index; @@ -686,6 +727,12 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) offset = (bio->bi_iter.bi_sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT; + if (unlikely(bio->bi_rw & REQ_DISCARD)) { + zram_bio_discard(zram, index, offset, bio); + bio_endio(bio, 0); + return; + } + bio_for_each_segment(bvec, bio, iter) { int max_transfer_size = PAGE_SIZE - offset; @@ -855,6 +902,21 @@ static int create_device(struct zram *zram, int device_id) ZRAM_LOGICAL_BLOCK_SIZE); blk_queue_io_min(zram->disk->queue, PAGE_SIZE); blk_queue_io_opt(zram->disk->queue, PAGE_SIZE); + zram->disk->queue->limits.discard_granularity = PAGE_SIZE; + zram->disk->queue->limits.max_discard_sectors = UINT_MAX; + /* + * zram_bio_discard() will clear all logical blocks if logical block + * size is identical with physical block size(PAGE_SIZE). But if it is + * different, we will skip discarding some parts of logical blocks in + * the part of the request range which isn't aligned to physical block + * size. So we can't ensure that all discarded logical blocks are + * zeroed. + */ + if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE) + zram->disk->queue->limits.discard_zeroes_data = 1; + else + zram->disk->queue->limits.discard_zeroes_data = 0; + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zram->disk->queue); add_disk(zram->disk); -- cgit v1.2.3 From 44bd70c347c466616e430b044c49d48fac29789d Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 8 Apr 2014 13:43:52 -0700 Subject: drivers/block/loop.c: ratelimit error messages Metric tons of high speed spew is not helpful when things go pear shaped. systemd lost its mind, forgot how to stop services it insists on being sole manager of, massive printk() flood ensued, box eventually died. [16206.684000] loop: Write error at byte offset 11412291584, length 4096. [16206.684000] systemd-journald[1758]: /dev/kmsg buffer overrun, some messages lost. [16206.684000] loop: Write error at byte offset 13155434496, length 4096. [16206.684000] loop: Write error at byte offset 13155438592, length 4096. [16206.684000] loop: Write error at byte offset 13155442688, length 4096. [16206.684000] loop: Write error at byte offset 13960736768, length 4096. [16206.684000] loop: Write error at byte offset 14229172224, length 4096. [16206.684000] systemd-journald[1758]: /dev/kmsg buffer overrun, some messages lost. [16206.684000] loop: Write error at byte offset 14766043136, length 4096. [16206.684000] loop: Write error at byte offset 15034478592, length 4096. [16206.684000] systemd-journald[1758]: /dev/kmsg buffer overrun, some messages lost. 
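For reference, printk_ratelimited() is the stock helper for exactly this situation; it expands to roughly the following pattern (see include/linux/ratelimit.h), which by default lets through a burst of 10 messages per 5-second interval and drops the rest:

    static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
                                  DEFAULT_RATELIMIT_BURST);

    if (__ratelimit(&_rs))
        printk(KERN_ERR "loop: Write error at byte offset %llu, length %i.\n",
               (unsigned long long)pos, len);
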
Signed-off-by: Mike Galbraith Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- drivers/block/loop.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 66e8c3b94ef3..f70a230a2945 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -237,7 +237,7 @@ static int __do_lo_send_write(struct file *file, file_end_write(file); if (likely(bw == len)) return 0; - printk(KERN_ERR "loop: Write error at byte offset %llu, length %i.\n", + printk_ratelimited(KERN_ERR "loop: Write error at byte offset %llu, length %i.\n", (unsigned long long)pos, len); if (bw >= 0) bw = -EIO; @@ -277,7 +277,7 @@ static int do_lo_send_write(struct loop_device *lo, struct bio_vec *bvec, return __do_lo_send_write(lo->lo_backing_file, page_address(page), bvec->bv_len, pos); - printk(KERN_ERR "loop: Transfer error at byte offset %llu, " + printk_ratelimited(KERN_ERR "loop: Transfer error at byte offset %llu, " "length %i.\n", (unsigned long long)pos, bvec->bv_len); if (ret > 0) ret = -EIO; @@ -316,7 +316,7 @@ static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos) out: return ret; fail: - printk(KERN_ERR "loop: Failed to allocate temporary page for write.\n"); + printk_ratelimited(KERN_ERR "loop: Failed to allocate temporary page for write.\n"); ret = -ENOMEM; goto out; } @@ -345,7 +345,7 @@ lo_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, size = p->bsize; if (lo_do_transfer(lo, READ, page, buf->offset, p->page, p->offset, size, IV)) { - printk(KERN_ERR "loop: transfer error block %ld\n", + printk_ratelimited(KERN_ERR "loop: transfer error block %ld\n", page->index); size = -EINVAL; } -- cgit v1.2.3 From 42f614201e80ff4cfb8b285d7190149a8e1e6cec Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 24 Mar 2014 10:46:25 -0600 Subject: NVMe: per-cpu io queues The device's IO queues are associated with CPUs, so we can use a per-cpu variable to map the a qid to a cpu. This provides a convienient way to optimally assign queues to multiple cpus when the device supports fewer queues than the host has cpus. The previous implementation may have assigned these poorly in these situations. This patch addresses this by sharing queues among cpus that are "close" together and should have a lower lock contention penalty. 
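The heart of the change is a per-CPU queue-id table that replaces the old get_cpu() + 1 indexing. A minimal sketch of that pattern, using the io_queue member the patch below adds (illustration only, allocation error handling omitted):

    /* one qid slot per possible CPU */
    dev->io_queue = alloc_percpu(unsigned short);

    /* assignment path: point CPU 'cpu' at hardware queue 'qid' */
    *per_cpu_ptr(dev->io_queue, cpu) = qid;

    /* submission path: pin the current CPU and look up its queue */
    qid = get_cpu_var(*dev->io_queue);   /* disables preemption */
    /* ... submit on dev->queues[qid] under rcu_read_lock() ... */
    put_cpu_var(*dev->io_queue);         /* re-enables preemption */
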
Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 204 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 167 insertions(+), 37 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index e9495f0bfad3..48d7bd55207a 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -35,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -96,6 +98,7 @@ struct nvme_queue { u8 cq_phase; u8 cqe_seen; u8 q_suspended; + cpumask_var_t cpu_mask; struct async_cmd_info cmdinfo; unsigned long cmdid_data[]; }; @@ -270,14 +273,15 @@ static struct nvme_queue *raw_nvmeq(struct nvme_dev *dev, int qid) static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU) { + unsigned queue_id = get_cpu_var(*dev->io_queue); rcu_read_lock(); - return rcu_dereference(dev->queues[get_cpu() + 1]); + return rcu_dereference(dev->queues[queue_id]); } static void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU) { - put_cpu(); rcu_read_unlock(); + put_cpu_var(nvmeq->dev->io_queue); } static struct nvme_queue *lock_nvmeq(struct nvme_dev *dev, int q_idx) @@ -1121,6 +1125,8 @@ static void nvme_free_queue(struct rcu_head *r) (void *)nvmeq->cqes, nvmeq->cq_dma_addr); dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), nvmeq->sq_cmds, nvmeq->sq_dma_addr); + if (nvmeq->qid) + free_cpumask_var(nvmeq->cpu_mask); kfree(nvmeq); } @@ -1128,8 +1134,6 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest) { int i; - for (i = num_possible_cpus(); i > dev->queue_count - 1; i--) - rcu_assign_pointer(dev->queues[i], NULL); for (i = dev->queue_count - 1; i >= lowest; i--) { struct nvme_queue *nvmeq = raw_nvmeq(dev, i); rcu_assign_pointer(dev->queues[i], NULL); @@ -1154,6 +1158,7 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) return 1; } nvmeq->q_suspended = 1; + nvmeq->dev->online_queues--; spin_unlock_irq(&nvmeq->q_lock); irq_set_affinity_hint(vector, NULL); @@ -1208,6 +1213,9 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, if (!nvmeq->sq_cmds) goto free_cqdma; + if (qid && !zalloc_cpumask_var(&nvmeq->cpu_mask, GFP_KERNEL)) + goto free_sqdma; + nvmeq->q_dmadev = dmadev; nvmeq->dev = dev; snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d", @@ -1228,6 +1236,9 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, return nvmeq; + free_sqdma: + dma_free_coherent(dmadev, SQ_SIZE(depth), (void *)nvmeq->sq_cmds, + nvmeq->sq_dma_addr); free_cqdma: dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes, nvmeq->cq_dma_addr); @@ -1260,6 +1271,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); nvme_cancel_ios(nvmeq, false); nvmeq->q_suspended = 0; + dev->online_queues++; } static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) @@ -1835,6 +1847,143 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, return NULL; } +static int nvme_find_closest_node(int node) +{ + int n, val, min_val = INT_MAX, best_node = node; + + for_each_online_node(n) { + if (n == node) + continue; + val = node_distance(node, n); + if (val < min_val) { + min_val = val; + best_node = n; + } + } + return best_node; +} + +static void nvme_set_queue_cpus(cpumask_t *qmask, struct nvme_queue *nvmeq, + int count) +{ + int cpu; + for_each_cpu(cpu, qmask) { + 
if (cpumask_weight(nvmeq->cpu_mask) >= count) + break; + if (!cpumask_test_and_set_cpu(cpu, nvmeq->cpu_mask)) + *per_cpu_ptr(nvmeq->dev->io_queue, cpu) = nvmeq->qid; + } +} + +static void nvme_add_cpus(cpumask_t *mask, const cpumask_t *unassigned_cpus, + const cpumask_t *new_mask, struct nvme_queue *nvmeq, int cpus_per_queue) +{ + int next_cpu; + for_each_cpu(next_cpu, new_mask) { + cpumask_or(mask, mask, get_cpu_mask(next_cpu)); + cpumask_or(mask, mask, topology_thread_cpumask(next_cpu)); + cpumask_and(mask, mask, unassigned_cpus); + nvme_set_queue_cpus(mask, nvmeq, cpus_per_queue); + } +} + +static void nvme_create_io_queues(struct nvme_dev *dev) +{ + unsigned i, max; + + max = min(dev->max_qid, num_online_cpus()); + for (i = dev->queue_count; i <= max; i++) + if (!nvme_alloc_queue(dev, i, dev->q_depth, i - 1)) + break; + + max = min(dev->queue_count - 1, num_online_cpus()); + for (i = dev->online_queues; i <= max; i++) + if (nvme_create_queue(raw_nvmeq(dev, i), i)) + break; +} + +/* + * If there are fewer queues than online cpus, this will try to optimally + * assign a queue to multiple cpus by grouping cpus that are "close" together: + * thread siblings, core, socket, closest node, then whatever else is + * available. + */ +static void nvme_assign_io_queues(struct nvme_dev *dev) +{ + unsigned cpu, cpus_per_queue, queues, remainder, i; + cpumask_var_t unassigned_cpus; + + nvme_create_io_queues(dev); + + queues = min(dev->online_queues - 1, num_online_cpus()); + if (!queues) + return; + + cpus_per_queue = num_online_cpus() / queues; + remainder = queues - (num_online_cpus() - queues * cpus_per_queue); + + if (!alloc_cpumask_var(&unassigned_cpus, GFP_KERNEL)) + return; + + cpumask_copy(unassigned_cpus, cpu_online_mask); + cpu = cpumask_first(unassigned_cpus); + for (i = 1; i <= queues; i++) { + struct nvme_queue *nvmeq = lock_nvmeq(dev, i); + cpumask_t mask; + + cpumask_clear(nvmeq->cpu_mask); + if (!cpumask_weight(unassigned_cpus)) { + unlock_nvmeq(nvmeq); + break; + } + + mask = *get_cpu_mask(cpu); + nvme_set_queue_cpus(&mask, nvmeq, cpus_per_queue); + if (cpus_weight(mask) < cpus_per_queue) + nvme_add_cpus(&mask, unassigned_cpus, + topology_thread_cpumask(cpu), + nvmeq, cpus_per_queue); + if (cpus_weight(mask) < cpus_per_queue) + nvme_add_cpus(&mask, unassigned_cpus, + topology_core_cpumask(cpu), + nvmeq, cpus_per_queue); + if (cpus_weight(mask) < cpus_per_queue) + nvme_add_cpus(&mask, unassigned_cpus, + cpumask_of_node(cpu_to_node(cpu)), + nvmeq, cpus_per_queue); + if (cpus_weight(mask) < cpus_per_queue) + nvme_add_cpus(&mask, unassigned_cpus, + cpumask_of_node( + nvme_find_closest_node( + cpu_to_node(cpu))), + nvmeq, cpus_per_queue); + if (cpus_weight(mask) < cpus_per_queue) + nvme_add_cpus(&mask, unassigned_cpus, + unassigned_cpus, + nvmeq, cpus_per_queue); + + WARN(cpumask_weight(nvmeq->cpu_mask) != cpus_per_queue, + "nvme%d qid:%d mis-matched queue-to-cpu assignment\n", + dev->instance, i); + + irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector, + nvmeq->cpu_mask); + cpumask_andnot(unassigned_cpus, unassigned_cpus, + nvmeq->cpu_mask); + cpu = cpumask_next(cpu, unassigned_cpus); + if (remainder && !--remainder) + cpus_per_queue++; + unlock_nvmeq(nvmeq); + } + WARN(cpumask_weight(unassigned_cpus), "nvme%d unassigned online cpus\n", + dev->instance); + i = 0; + cpumask_andnot(unassigned_cpus, cpu_possible_mask, cpu_online_mask); + for_each_cpu(cpu, unassigned_cpus) + *per_cpu_ptr(dev->io_queue, cpu) = (i++ % queues) + 1; + free_cpumask_var(unassigned_cpus); +} + static int 
set_queue_count(struct nvme_dev *dev, int count) { int status; @@ -1857,9 +2006,9 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) { struct nvme_queue *adminq = raw_nvmeq(dev, 0); struct pci_dev *pdev = dev->pci_dev; - int result, cpu, i, vecs, nr_io_queues, size, q_depth; + int result, i, vecs, nr_io_queues, size; - nr_io_queues = num_online_cpus(); + nr_io_queues = num_possible_cpus(); result = set_queue_count(dev, nr_io_queues); if (result < 0) return result; @@ -1919,6 +2068,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) * number of interrupts. */ nr_io_queues = vecs; + dev->max_qid = nr_io_queues; result = queue_request_irq(dev, adminq, adminq->irqname); if (result) { @@ -1927,36 +2077,8 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) } /* Free previously allocated queues that are no longer usable */ - nvme_free_queues(dev, nr_io_queues); - - cpu = cpumask_first(cpu_online_mask); - for (i = 0; i < nr_io_queues; i++) { - irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu)); - cpu = cpumask_next(cpu, cpu_online_mask); - } - - q_depth = min_t(int, NVME_CAP_MQES(readq(&dev->bar->cap)) + 1, - NVME_Q_DEPTH); - for (i = dev->queue_count - 1; i < nr_io_queues; i++) { - if (!nvme_alloc_queue(dev, i + 1, q_depth, i)) { - result = -ENOMEM; - goto free_queues; - } - } - - for (; i < num_possible_cpus(); i++) { - int target = i % rounddown_pow_of_two(dev->queue_count - 1); - rcu_assign_pointer(dev->queues[i + 1], dev->queues[target + 1]); - } - - for (i = 1; i < dev->queue_count; i++) { - result = nvme_create_queue(raw_nvmeq(dev, i), i); - if (result) { - for (--i; i > 0; i--) - nvme_disable_queue(dev, i); - goto free_queues; - } - } + nvme_free_queues(dev, nr_io_queues + 1); + nvme_assign_io_queues(dev); return 0; @@ -2035,6 +2157,7 @@ static int nvme_dev_add(struct nvme_dev *dev) static int nvme_dev_map(struct nvme_dev *dev) { + u64 cap; int bars, result = -ENOMEM; struct pci_dev *pdev = dev->pci_dev; @@ -2058,7 +2181,9 @@ static int nvme_dev_map(struct nvme_dev *dev) result = -ENODEV; goto unmap; } - dev->db_stride = 1 << NVME_CAP_STRIDE(readq(&dev->bar->cap)); + cap = readq(&dev->bar->cap); + dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH); + dev->db_stride = 1 << NVME_CAP_STRIDE(cap); dev->dbs = ((void __iomem *)dev->bar) + 4096; return 0; @@ -2332,6 +2457,7 @@ static void nvme_free_dev(struct kref *kref) struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref); nvme_free_namespaces(dev); + free_percpu(dev->io_queue); kfree(dev->queues); kfree(dev->entry); kfree(dev); @@ -2477,6 +2603,9 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) GFP_KERNEL); if (!dev->queues) goto free; + dev->io_queue = alloc_percpu(unsigned short); + if (!dev->io_queue) + goto free; INIT_LIST_HEAD(&dev->namespaces); INIT_WORK(&dev->reset_work, nvme_reset_failed_dev); @@ -2526,6 +2655,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) release: nvme_release_instance(dev); free: + free_percpu(dev->io_queue); kfree(dev->queues); kfree(dev->entry); kfree(dev); -- cgit v1.2.3 From 33b1e95c90447ea73e37e837ea0268a894919f19 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 24 Mar 2014 10:46:26 -0600 Subject: NVMe: CPU hot plug notification Registers with hot cpu notification to rebalance, and potentially allocate additional, io queues. 
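For background, the callback follows the usual hotplug notifier contract of this era: it is invoked with an action code for each CPU event and returns NOTIFY_OK when it has nothing to veto. A hedged sketch of the shape (names are illustrative, not the driver's):

    static int my_cpu_notify(struct notifier_block *self,
                             unsigned long action, void *hcpu)
    {
        switch (action) {
        case CPU_ONLINE:
        case CPU_DEAD:
            /* re-run queue-to-CPU assignment over the new online set */
            break;
        }
        return NOTIFY_OK;
    }

    static struct notifier_block nb = { .notifier_call = my_cpu_notify };

    register_hotcpu_notifier(&nb);      /* during setup */
    unregister_hotcpu_notifier(&nb);    /* during teardown */
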
Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 48d7bd55207a..ce5a4f1a3950 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2002,6 +2002,19 @@ static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride); } +static int nvme_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + struct nvme_dev *dev = container_of(self, struct nvme_dev, nb); + switch (action) { + case CPU_ONLINE: + case CPU_DEAD: + nvme_assign_io_queues(dev); + break; + } + return NOTIFY_OK; +} + static int nvme_setup_io_queues(struct nvme_dev *dev) { struct nvme_queue *adminq = raw_nvmeq(dev, 0); @@ -2080,6 +2093,11 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) nvme_free_queues(dev, nr_io_queues + 1); nvme_assign_io_queues(dev); + dev->nb.notifier_call = &nvme_cpu_notify; + result = register_hotcpu_notifier(&dev->nb); + if (result) + goto free_queues; + return 0; free_queues: @@ -2357,6 +2375,7 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) int i; dev->initialized = 0; + unregister_hotcpu_notifier(&dev->nb); spin_lock(&dev_list_lock); list_del_init(&dev->node); -- cgit v1.2.3 From b355084a891985d4cd0ca23b1a83366af2c4232d Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 4 Apr 2014 11:43:36 -0600 Subject: NVMe: Make I/O timeout a module parameter Increase the default timeout to 30 seconds to match SCSI. Signed-off-by: Keith Busch [use byte instead of ushort] Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index ce5a4f1a3950..7c57b1d955a1 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -50,6 +50,10 @@ #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) #define ADMIN_TIMEOUT (60 * HZ) +unsigned char io_timeout = 30; +module_param(io_timeout, byte, 0644); +MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); + static int nvme_major; module_param(nvme_major, int, 0); -- cgit v1.2.3 From b9afca3efb18a9b8392cb544a3e29e8b1168400c Mon Sep 17 00:00:00 2001 From: Dan McLeran Date: Mon, 7 Apr 2014 17:10:11 -0600 Subject: NVMe: Start-stop nvme_thread during device add-remove. Done to ensure nvme_thread is not running when there are no devices to poll. Signed-off-by: Dan McLeran Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 56 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 42 insertions(+), 14 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 7c57b1d955a1..2d69bfec95a4 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -64,6 +64,7 @@ static DEFINE_SPINLOCK(dev_list_lock); static LIST_HEAD(dev_list); static struct task_struct *nvme_thread; static struct workqueue_struct *nvme_workq; +static wait_queue_head_t nvme_kthread_wait; static void nvme_reset_failed_dev(struct work_struct *ws); @@ -2374,6 +2375,26 @@ static void nvme_disable_io_queues(struct nvme_dev *dev) kthread_stop(kworker_task); } +/* +* Remove the node from the device list and check +* for whether or not we need to stop the nvme_thread. 
+*/ +static void nvme_dev_list_remove(struct nvme_dev *dev) +{ + struct task_struct *tmp = NULL; + + spin_lock(&dev_list_lock); + list_del_init(&dev->node); + if (list_empty(&dev_list) && !IS_ERR_OR_NULL(nvme_thread)) { + tmp = nvme_thread; + nvme_thread = NULL; + } + spin_unlock(&dev_list_lock); + + if (tmp) + kthread_stop(tmp); +} + static void nvme_dev_shutdown(struct nvme_dev *dev) { int i; @@ -2381,9 +2402,7 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) dev->initialized = 0; unregister_hotcpu_notifier(&dev->nb); - spin_lock(&dev_list_lock); - list_del_init(&dev->node); - spin_unlock(&dev_list_lock); + nvme_dev_list_remove(dev); if (!dev->bar || (dev->bar && readl(&dev->bar->csts) == -1)) { for (i = dev->queue_count - 1; i >= 0; i--) { @@ -2524,6 +2543,7 @@ static const struct file_operations nvme_dev_fops = { static int nvme_dev_start(struct nvme_dev *dev) { int result; + bool start_thread = false; result = nvme_dev_map(dev); if (result) @@ -2534,9 +2554,24 @@ static int nvme_dev_start(struct nvme_dev *dev) goto unmap; spin_lock(&dev_list_lock); + if (list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) { + start_thread = true; + nvme_thread = NULL; + } list_add(&dev->node, &dev_list); spin_unlock(&dev_list_lock); + if (start_thread) { + nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); + wake_up(&nvme_kthread_wait); + } else + wait_event_killable(nvme_kthread_wait, nvme_thread); + + if (IS_ERR_OR_NULL(nvme_thread)) { + result = nvme_thread ? PTR_ERR(nvme_thread) : -EINTR; + goto disable; + } + result = nvme_setup_io_queues(dev); if (result && result != -EBUSY) goto disable; @@ -2545,9 +2580,7 @@ static int nvme_dev_start(struct nvme_dev *dev) disable: nvme_disable_queue(dev, 0); - spin_lock(&dev_list_lock); - list_del_init(&dev->node); - spin_unlock(&dev_list_lock); + nvme_dev_list_remove(dev); unmap: nvme_dev_unmap(dev); return result; @@ -2776,14 +2809,11 @@ static int __init nvme_init(void) { int result; - nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); - if (IS_ERR(nvme_thread)) - return PTR_ERR(nvme_thread); + init_waitqueue_head(&nvme_kthread_wait); - result = -ENOMEM; nvme_workq = create_singlethread_workqueue("nvme"); if (!nvme_workq) - goto kill_kthread; + return -ENOMEM; result = register_blkdev(nvme_major, "nvme"); if (result < 0) @@ -2800,8 +2830,6 @@ static int __init nvme_init(void) unregister_blkdev(nvme_major, "nvme"); kill_workq: destroy_workqueue(nvme_workq); - kill_kthread: - kthread_stop(nvme_thread); return result; } @@ -2810,7 +2838,7 @@ static void __exit nvme_exit(void) pci_unregister_driver(&nvme_driver); unregister_blkdev(nvme_major, "nvme"); destroy_workqueue(nvme_workq); - kthread_stop(nvme_thread); + BUG_ON(nvme_thread && !IS_ERR(nvme_thread)); } MODULE_AUTHOR("Matthew Wilcox "); -- cgit v1.2.3 From 4cc09e2dc4cbe6009c935b6f12a8376f09124bc5 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 2 Apr 2014 15:45:37 -0600 Subject: NVMe: Add getgeo to block ops Some programs require HDIO_GETGEO work, which requires we implement getgeo. 
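For reference, a minimal userspace sketch of the HDIO_GETGEO call this enables (the device path is illustrative):

    #include <stdio.h>
    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <linux/hdreg.h>

    int main(void)
    {
        struct hd_geometry geo;
        int fd = open("/dev/nvme0n1", O_RDONLY);   /* illustrative path */

        if (fd < 0 || ioctl(fd, HDIO_GETGEO, &geo) < 0) {
            perror("HDIO_GETGEO");
            return 1;
        }
        printf("heads=%u sectors=%u cylinders=%u start=%lu\n",
               geo.heads, geo.sectors, geo.cylinders, geo.start);
        return 0;
    }
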
Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 2d69bfec95a4..596e2abd7971 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -1714,12 +1715,22 @@ static void nvme_release(struct gendisk *disk, fmode_t mode) kref_put(&dev->kref, nvme_free_dev); } +static int nvme_getgeo(struct block_device *bd, struct hd_geometry *geo) +{ + /* some standard values */ + geo->heads = 1 << 6; + geo->sectors = 1 << 5; + geo->cylinders = get_capacity(bd->bd_disk) >> 11; + return 0; +} + static const struct block_device_operations nvme_fops = { .owner = THIS_MODULE, .ioctl = nvme_ioctl, .compat_ioctl = nvme_compat_ioctl, .open = nvme_open, .release = nvme_release, + .getgeo = nvme_getgeo, }; static void nvme_resubmit_bios(struct nvme_queue *nvmeq) -- cgit v1.2.3 From edd10d33283899fb15d99a290dcc9ceb3604ca78 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 3 Apr 2014 16:45:23 -0600 Subject: NVMe: Retry failed commands with non-fatal errors For commands returned with failed status, queue these for resubmission and continue retrying them until success or for a limited amount of time. The final timeout was arbitrarily chosen so requests can't be retried indefinitely. Since these are requeued on the nvmeq that submitted the command, the callbacks have to take an nvmeq instead of an nvme_dev as a parameter so that we can use the locked queue to append the iod to retry later. The nvme_iod conviently can be used to track how long we've been trying to successfully complete an iod request. The nvme_iod also provides the nvme prp dma mappings, so I had to move a few things around so we can keep those mappings. 
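The retry decision that bio_completion() makes below boils down to three checks; paraphrased as a sketch (the helper name is invented for illustration, not a literal copy of the patch; NVME_SC_DNR, REQ_FAILFAST_MASK and the IOD_TIMEOUT window all come from the patch itself):

    /* sketch: is a failed iod worth requeueing on nvmeq->iod_bio? */
    static bool nvme_should_retry(u16 status, struct bio *bio,
                                  struct nvme_iod *iod)
    {
        if (status & NVME_SC_DNR)            /* controller said "do not retry" */
            return false;
        if (bio->bi_rw & REQ_FAILFAST_MASK)  /* submitter wants fast failure */
            return false;
        return (jiffies - iod->start_time) < IOD_TIMEOUT;
    }
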
Signed-off-by: Keith Busch [fixed checkpatch issue with long line] Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 235 ++++++++++++++++++++++++++++------------------ drivers/block/nvme-scsi.c | 10 +- 2 files changed, 151 insertions(+), 94 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 596e2abd7971..efa9c8f4a7a7 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -50,6 +50,7 @@ #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) #define ADMIN_TIMEOUT (60 * HZ) +#define IOD_TIMEOUT (4 * NVME_IO_TIMEOUT) unsigned char io_timeout = 30; module_param(io_timeout, byte, 0644); @@ -94,6 +95,7 @@ struct nvme_queue { wait_queue_head_t sq_full; wait_queue_t sq_cong_wait; struct bio_list sq_cong; + struct list_head iod_bio; u32 __iomem *q_db; u16 q_depth; u16 cq_vector; @@ -128,7 +130,7 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); } -typedef void (*nvme_completion_fn)(struct nvme_dev *, void *, +typedef void (*nvme_completion_fn)(struct nvme_queue *, void *, struct nvme_completion *); struct nvme_cmd_info { @@ -200,7 +202,7 @@ static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, #define CMD_CTX_FLUSH (0x318 + CMD_CTX_BASE) #define CMD_CTX_ABORT (0x31C + CMD_CTX_BASE) -static void special_completion(struct nvme_dev *dev, void *ctx, +static void special_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { if (ctx == CMD_CTX_CANCELLED) @@ -208,26 +210,26 @@ static void special_completion(struct nvme_dev *dev, void *ctx, if (ctx == CMD_CTX_FLUSH) return; if (ctx == CMD_CTX_ABORT) { - ++dev->abort_limit; + ++nvmeq->dev->abort_limit; return; } if (ctx == CMD_CTX_COMPLETED) { - dev_warn(&dev->pci_dev->dev, + dev_warn(nvmeq->q_dmadev, "completed id %d twice on queue %d\n", cqe->command_id, le16_to_cpup(&cqe->sq_id)); return; } if (ctx == CMD_CTX_INVALID) { - dev_warn(&dev->pci_dev->dev, + dev_warn(nvmeq->q_dmadev, "invalid id %d completed on queue %d\n", cqe->command_id, le16_to_cpup(&cqe->sq_id)); return; } - dev_warn(&dev->pci_dev->dev, "Unknown special completion %p\n", ctx); + dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx); } -static void async_completion(struct nvme_dev *dev, void *ctx, +static void async_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { struct async_cmd_info *cmdinfo = ctx; @@ -357,6 +359,7 @@ nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp) iod->npages = -1; iod->length = nbytes; iod->nents = 0; + iod->first_dma = 0ULL; iod->start_time = jiffies; } @@ -405,19 +408,31 @@ static void nvme_end_io_acct(struct bio *bio, unsigned long start_time) part_stat_unlock(); } -static void bio_completion(struct nvme_dev *dev, void *ctx, +static void bio_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { struct nvme_iod *iod = ctx; struct bio *bio = iod->private; u16 status = le16_to_cpup(&cqe->status) >> 1; + if (unlikely(status)) { + if (!(status & NVME_SC_DNR || + bio->bi_rw & REQ_FAILFAST_MASK) && + (jiffies - iod->start_time) < IOD_TIMEOUT) { + if (!waitqueue_active(&nvmeq->sq_full)) + add_wait_queue(&nvmeq->sq_full, + &nvmeq->sq_cong_wait); + list_add_tail(&iod->node, &nvmeq->iod_bio); + wake_up(&nvmeq->sq_full); + return; + } + } if (iod->nents) { - dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, + dma_unmap_sg(nvmeq->q_dmadev, iod->sg, 
iod->nents, bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); nvme_end_io_acct(bio, iod->start_time); } - nvme_free_iod(dev, iod); + nvme_free_iod(nvmeq->dev, iod); if (status) bio_endio(bio, -EIO); else @@ -425,8 +440,8 @@ static void bio_completion(struct nvme_dev *dev, void *ctx, } /* length is in bytes. gfp flags indicates whether we may sleep. */ -int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd, - struct nvme_iod *iod, int total_len, gfp_t gfp) +int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len, + gfp_t gfp) { struct dma_pool *pool; int length = total_len; @@ -439,7 +454,6 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd, dma_addr_t prp_dma; int nprps, i; - cmd->prp1 = cpu_to_le64(dma_addr); length -= (PAGE_SIZE - offset); if (length <= 0) return total_len; @@ -454,7 +468,7 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd, } if (length <= PAGE_SIZE) { - cmd->prp2 = cpu_to_le64(dma_addr); + iod->first_dma = dma_addr; return total_len; } @@ -469,13 +483,12 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd, prp_list = dma_pool_alloc(pool, gfp, &prp_dma); if (!prp_list) { - cmd->prp2 = cpu_to_le64(dma_addr); + iod->first_dma = dma_addr; iod->npages = -1; return (total_len - length) + PAGE_SIZE; } list[0] = prp_list; iod->first_dma = prp_dma; - cmd->prp2 = cpu_to_le64(prp_dma); i = 0; for (;;) { if (i == PAGE_SIZE / 8) { @@ -514,10 +527,11 @@ static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq, bio_chain(split, bio); - if (bio_list_empty(&nvmeq->sq_cong)) + if (!waitqueue_active(&nvmeq->sq_full)) add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); bio_list_add(&nvmeq->sq_cong, split); bio_list_add(&nvmeq->sq_cong, bio); + wake_up(&nvmeq->sq_full); return 0; } @@ -570,25 +584,13 @@ static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod, return length; } -/* - * We reuse the small pool to allocate the 16-byte range here as it is not - * worth having a special pool for these or additional cases to handle freeing - * the iod. - */ static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, struct bio *bio, struct nvme_iod *iod, int cmdid) { - struct nvme_dsm_range *range; + struct nvme_dsm_range *range = + (struct nvme_dsm_range *)iod_list(iod)[0]; struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; - range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC, - &iod->first_dma); - if (!range) - return -ENOMEM; - - iod_list(iod)[0] = (__le64 *)range; - iod->npages = 0; - range->cattr = cpu_to_le32(0); range->nlb = cpu_to_le32(bio->bi_iter.bi_size >> ns->lba_shift); range->slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector)); @@ -635,44 +637,22 @@ int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns) return nvme_submit_flush(nvmeq, ns, cmdid); } -/* - * Called with local interrupts disabled and the q_lock held. May not sleep. 
- */ -static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, - struct bio *bio) +static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod) { + struct bio *bio = iod->private; + struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data; struct nvme_command *cmnd; - struct nvme_iod *iod; - enum dma_data_direction dma_dir; - int cmdid, length, result; + int cmdid; u16 control; u32 dsmgmt; - int psegs = bio_phys_segments(ns->queue, bio); - - if ((bio->bi_rw & REQ_FLUSH) && psegs) { - result = nvme_submit_flush_data(nvmeq, ns); - if (result) - return result; - } - - result = -ENOMEM; - iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, GFP_ATOMIC); - if (!iod) - goto nomem; - iod->private = bio; - result = -EBUSY; cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT); if (unlikely(cmdid < 0)) - goto free_iod; + return cmdid; - if (bio->bi_rw & REQ_DISCARD) { - result = nvme_submit_discard(nvmeq, ns, bio, iod, cmdid); - if (result) - goto free_cmdid; - return result; - } - if ((bio->bi_rw & REQ_FLUSH) && !psegs) + if (bio->bi_rw & REQ_DISCARD) + return nvme_submit_discard(nvmeq, ns, bio, iod, cmdid); + if ((bio->bi_rw & REQ_FLUSH) && !iod->nents) return nvme_submit_flush(nvmeq, ns, cmdid); control = 0; @@ -686,42 +666,85 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; - memset(cmnd, 0, sizeof(*cmnd)); - if (bio_data_dir(bio)) { - cmnd->rw.opcode = nvme_cmd_write; - dma_dir = DMA_TO_DEVICE; - } else { - cmnd->rw.opcode = nvme_cmd_read; - dma_dir = DMA_FROM_DEVICE; - } - - result = nvme_map_bio(nvmeq, iod, bio, dma_dir, psegs); - if (result <= 0) - goto free_cmdid; - length = result; + cmnd->rw.opcode = bio_data_dir(bio) ? nvme_cmd_write : nvme_cmd_read; cmnd->rw.command_id = cmdid; cmnd->rw.nsid = cpu_to_le32(ns->ns_id); - length = nvme_setup_prps(nvmeq->dev, &cmnd->common, iod, length, - GFP_ATOMIC); + cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); + cmnd->rw.prp2 = cpu_to_le64(iod->first_dma); cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector)); - cmnd->rw.length = cpu_to_le16((length >> ns->lba_shift) - 1); + cmnd->rw.length = + cpu_to_le16((bio->bi_iter.bi_size >> ns->lba_shift) - 1); cmnd->rw.control = cpu_to_le16(control); cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); - nvme_start_io_acct(bio); if (++nvmeq->sq_tail == nvmeq->q_depth) nvmeq->sq_tail = 0; writel(nvmeq->sq_tail, nvmeq->q_db); return 0; +} + +/* + * Called with local interrupts disabled and the q_lock held. May not sleep. + */ +static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, + struct bio *bio) +{ + struct nvme_iod *iod; + int psegs = bio_phys_segments(ns->queue, bio); + int result; + + if ((bio->bi_rw & REQ_FLUSH) && psegs) { + result = nvme_submit_flush_data(nvmeq, ns); + if (result) + return result; + } + + iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, GFP_ATOMIC); + if (!iod) + return -ENOMEM; + + iod->private = bio; + if (bio->bi_rw & REQ_DISCARD) { + void *range; + /* + * We reuse the small pool to allocate the 16-byte range here + * as it is not worth having a special pool for these or + * additional cases to handle freeing the iod. 
+ */ + range = dma_pool_alloc(nvmeq->dev->prp_small_pool, + GFP_ATOMIC, + &iod->first_dma); + if (!range) { + result = -ENOMEM; + goto free_iod; + } + iod_list(iod)[0] = (__le64 *)range; + iod->npages = 0; + } else if (psegs) { + result = nvme_map_bio(nvmeq, iod, bio, + bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, + psegs); + if (result <= 0) + goto free_iod; + if (nvme_setup_prps(nvmeq->dev, iod, result, GFP_ATOMIC) != + result) { + result = -ENOMEM; + goto free_iod; + } + nvme_start_io_acct(bio); + } + if (unlikely(nvme_submit_iod(nvmeq, iod))) { + if (!waitqueue_active(&nvmeq->sq_full)) + add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); + list_add_tail(&iod->node, &nvmeq->iod_bio); + } + return 0; - free_cmdid: - free_cmdid(nvmeq, cmdid, NULL); free_iod: nvme_free_iod(nvmeq->dev, iod); - nomem: return result; } @@ -745,7 +768,7 @@ static int nvme_process_cq(struct nvme_queue *nvmeq) } ctx = free_cmdid(nvmeq, cqe.command_id, &fn); - fn(nvmeq->dev, ctx, &cqe); + fn(nvmeq, ctx, &cqe); } /* If the controller ignores the cq head doorbell and continuously @@ -781,7 +804,7 @@ static void nvme_make_request(struct request_queue *q, struct bio *bio) if (!nvmeq->q_suspended && bio_list_empty(&nvmeq->sq_cong)) result = nvme_submit_bio_queue(nvmeq, ns, bio); if (unlikely(result)) { - if (bio_list_empty(&nvmeq->sq_cong)) + if (!waitqueue_active(&nvmeq->sq_full)) add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); bio_list_add(&nvmeq->sq_cong, bio); } @@ -825,7 +848,7 @@ struct sync_cmd_info { int status; }; -static void sync_completion(struct nvme_dev *dev, void *ctx, +static void sync_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { struct sync_cmd_info *cmdinfo = ctx; @@ -1112,7 +1135,7 @@ static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout) dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", cmdid, nvmeq->qid); ctx = cancel_cmdid(nvmeq, cmdid, &fn); - fn(nvmeq->dev, ctx, &cqe); + fn(nvmeq, ctx, &cqe); } } @@ -1125,6 +1148,17 @@ static void nvme_free_queue(struct rcu_head *r) struct bio *bio = bio_list_pop(&nvmeq->sq_cong); bio_endio(bio, -EIO); } + while (!list_empty(&nvmeq->iod_bio)) { + static struct nvme_completion cqe = { + .status = cpu_to_le16( + (NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1), + }; + struct nvme_iod *iod = list_first_entry(&nvmeq->iod_bio, + struct nvme_iod, + node); + list_del(&iod->node); + bio_completion(nvmeq, iod, &cqe); + } spin_unlock_irq(&nvmeq->q_lock); dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), @@ -1232,6 +1266,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, init_waitqueue_head(&nvmeq->sq_full); init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread); bio_list_init(&nvmeq->sq_cong); + INIT_LIST_HEAD(&nvmeq->iod_bio); nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; nvmeq->q_depth = depth; nvmeq->cq_vector = vector; @@ -1565,7 +1600,9 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) c.rw.metadata = cpu_to_le64(meta_dma_addr); } - length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL); + length = nvme_setup_prps(dev, iod, length, GFP_KERNEL); + c.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); + c.rw.prp2 = cpu_to_le64(iod->first_dma); if (length != (io.nblocks + 1) << ns->lba_shift) status = -ENOMEM; @@ -1635,8 +1672,9 @@ static int nvme_user_admin_cmd(struct nvme_dev *dev, length); if (IS_ERR(iod)) return PTR_ERR(iod); - length = nvme_setup_prps(dev, &c.common, iod, length, - GFP_KERNEL); + length = 
nvme_setup_prps(dev, iod, length, GFP_KERNEL); + c.common.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); + c.common.prp2 = cpu_to_le64(iod->first_dma); } timeout = cmd.timeout_ms ? msecs_to_jiffies(cmd.timeout_ms) : @@ -1733,17 +1771,33 @@ static const struct block_device_operations nvme_fops = { .getgeo = nvme_getgeo, }; +static void nvme_resubmit_iods(struct nvme_queue *nvmeq) +{ + struct nvme_iod *iod, *next; + + list_for_each_entry_safe(iod, next, &nvmeq->iod_bio, node) { + if (unlikely(nvme_submit_iod(nvmeq, iod))) + break; + list_del(&iod->node); + if (bio_list_empty(&nvmeq->sq_cong) && + list_empty(&nvmeq->iod_bio)) + remove_wait_queue(&nvmeq->sq_full, + &nvmeq->sq_cong_wait); + } +} + static void nvme_resubmit_bios(struct nvme_queue *nvmeq) { while (bio_list_peek(&nvmeq->sq_cong)) { struct bio *bio = bio_list_pop(&nvmeq->sq_cong); struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data; - if (bio_list_empty(&nvmeq->sq_cong)) + if (bio_list_empty(&nvmeq->sq_cong) && + list_empty(&nvmeq->iod_bio)) remove_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); if (nvme_submit_bio_queue(nvmeq, ns, bio)) { - if (bio_list_empty(&nvmeq->sq_cong)) + if (!waitqueue_active(&nvmeq->sq_full)) add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); bio_list_add_head(&nvmeq->sq_cong, bio); @@ -1785,6 +1839,7 @@ static int nvme_kthread(void *data) nvme_process_cq(nvmeq); nvme_cancel_ios(nvmeq, true); nvme_resubmit_bios(nvmeq); + nvme_resubmit_iods(nvmeq); unlock: spin_unlock_irq(&nvmeq->q_lock); } diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index 111c920c1574..2c3f5be06da1 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -1562,13 +1562,14 @@ static int nvme_trans_send_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, res = PTR_ERR(iod); goto out; } - length = nvme_setup_prps(dev, &c.common, iod, tot_len, - GFP_KERNEL); + length = nvme_setup_prps(dev, iod, tot_len, GFP_KERNEL); if (length != tot_len) { res = -ENOMEM; goto out_unmap; } + c.dlfw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); + c.dlfw.prp2 = cpu_to_le64(iod->first_dma); c.dlfw.numd = cpu_to_le32((tot_len/BYTES_TO_DWORDS) - 1); c.dlfw.offset = cpu_to_le32(offset/BYTES_TO_DWORDS); } else if (opcode == nvme_admin_activate_fw) { @@ -2092,8 +2093,7 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, res = PTR_ERR(iod); goto out; } - retcode = nvme_setup_prps(dev, &c.common, iod, unit_len, - GFP_KERNEL); + retcode = nvme_setup_prps(dev, iod, unit_len, GFP_KERNEL); if (retcode != unit_len) { nvme_unmap_user_pages(dev, (is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, @@ -2102,6 +2102,8 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, res = -ENOMEM; goto out; } + c.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); + c.rw.prp2 = cpu_to_le64(iod->first_dma); nvme_offset += unit_num_blocks; -- cgit v1.2.3
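A closing note on the PRP fields every submitter now fills by hand: per the NVMe PRP convention (and matching what nvme_setup_prps() computes above), PRP1 is the bus address of the first data page, while PRP2 is unused for a one-page transfer, the second page for a two-page transfer, or the address of a PRP list for anything larger; nvme_setup_prps() leaves that second value in iod->first_dma in every case. The resulting caller-side idiom, as used throughout this patch (variable names illustrative):

    length = nvme_setup_prps(dev, iod, total_len, GFP_KERNEL);
    cmd.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));  /* first data page */
    cmd.rw.prp2 = cpu_to_le64(iod->first_dma);           /* 2nd page or PRP list */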