author     Linus Torvalds <torvalds@linux-foundation.org>    2015-02-12 14:30:53 -0800
committer  Linus Torvalds <torvalds@linux-foundation.org>    2015-02-12 14:30:53 -0800
commit     8494bcf5b7c4b2416687e233dd34d4c6b6fe5653 (patch)
tree       cebb468e170e639ecfd61ddc5ebcba86c21105fa /drivers/block/nvme-core.c
parent     3e12cefbe143b4947171ff92dd50024c4841e291 (diff)
parent     b042a3ca949053231950a1b15f31cccca9e305f3 (diff)
Merge branch 'for-3.20/drivers' of git://git.kernel.dk/linux-block
Pull block driver changes from Jens Axboe:
"This contains:
- The 4k/partition fixes for brd from Boaz/Matthew.
- A few xen front/back block fixes from David Vrabel and Roger Pau
Monne.
- Floppy changes from Takashi, cleaning the device file creation.
- Switching libata to use the new blk-mq tagging policy, removing
code (and a suboptimal implementation) from libata. This will
throw you a merge conflict, since a bug in the original libata
tagging code was fixed since this code was branched. Trivial.
From Shaohua.
- Conversion of loop to blk-mq, from Ming Lei.
- Cleanup of the io_schedule() handling in bsg from Peter Zijlstra.
He claims it improves on unreadable code, which will cost him a
beer.
- Maintainer update for NBD, now handled by Markus Pargmann.
- NVMe:
- Optimization from me that avoids a kmalloc/kfree per IO for
smaller (<= 8KB) IO. This cuts about 1% of high IOPS CPU
overhead.
- Removal of (now) dead RCU code, a relic from before NVMe was
converted to blk-mq"
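
The kmalloc/kfree avoidance described in the NVMe item above works by reserving a little extra room in the per-request PDU that blk-mq already preallocates for every tag (via cmd_size), placing the I/O descriptor there when the request is small enough, and tagging bit 0 of the descriptor's request back-pointer so the completion path knows whether a kfree() is owed. What follows is a minimal, self-contained userspace sketch of that pattern, not the driver code: cmd_info, io_desc and INLINE_BYTES are placeholder names standing in for nvme_cmd_info, nvme_iod and NVME_INT_BYTES; the real implementation is in the nvme-core.c diff further down.

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

/* Hypothetical threshold standing in for NVME_INT_BYTES(dev). */
#define INLINE_BYTES	8192

/* Tiny stand-in for the driver's struct nvme_iod: 'private' carries the
 * owning request pointer, with bit 0 set when the descriptor is embedded. */
struct io_desc {
	uintptr_t private;
	size_t len;
};

/* Stand-in for the per-request PDU (struct nvme_cmd_info) that blk-mq
 * preallocates once per tag; the embedded descriptor lives inside it. */
struct cmd_info {
	int tag;
	struct io_desc iod;
};

static void desc_init(struct io_desc *d, size_t len, uintptr_t private)
{
	d->private = private;
	d->len = len;
}

/* Small IO reuses the preallocated space; large IO falls back to malloc. */
static struct io_desc *alloc_desc(struct cmd_info *cmd, void *req, size_t len)
{
	struct io_desc *d;

	if (len <= INLINE_BYTES) {
		d = &cmd->iod;
		desc_init(d, len, (uintptr_t)req | 0x1UL);	/* mark embedded */
		return d;
	}

	d = malloc(sizeof(*d));
	if (!d)
		return NULL;
	desc_init(d, len, (uintptr_t)req);			/* bit 0 clear */
	return d;
}

static void *desc_request(struct io_desc *d)
{
	return (void *)(d->private & ~0x1UL);	/* strip the flag bit */
}

static void free_desc(struct io_desc *d)
{
	if ((d->private & 0x1UL) == 0)		/* only heap-allocated descriptors */
		free(d);
}

int main(void)
{
	struct cmd_info cmd = { .tag = 7 };
	int fake_req;

	struct io_desc *small = alloc_desc(&cmd, &fake_req, 4096);
	struct io_desc *big   = alloc_desc(&cmd, &fake_req, 65536);

	if (!small || !big)
		return 1;

	printf("small: embedded=%d req_ok=%d\n",
	       (int)(small->private & 1), desc_request(small) == (void *)&fake_req);
	printf("big:   embedded=%d req_ok=%d\n",
	       (int)(big->private & 1), desc_request(big) == (void *)&fake_req);

	free_desc(small);	/* no-op: lives in cmd_info */
	free_desc(big);		/* releases the heap copy */
	return 0;
}

The trick relies on the request pointer being at least 2-byte aligned, so bit 0 is always free to act as the "embedded" flag; freeing becomes a no-op on the fast path, and small IOs stop paying for an allocator round-trip.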
* 'for-3.20/drivers' of git://git.kernel.dk/linux-block:
xen-blkback: default to X86_32 ABI on x86
xen-blkfront: fix accounting of reqs when migrating
xen-blkback,xen-blkfront: add myself as maintainer
block: Simplify bsg complete all
floppy: Avoid manual call of device_create_file()
NVMe: avoid kmalloc/kfree for smaller IO
MAINTAINERS: Update NBD maintainer
libata: make sata_sil24 use fifo tag allocator
libata: move sas ata tag allocation to libata-scsi.c
libata: use blk taging
NVMe: within nvme_free_queues(), delete RCU sychro/deferred free
null_blk: suppress invalid partition info
brd: Request from fdisk 4k alignment
brd: Fix all partitions BUGs
axonram: Fix bug in direct_access
loop: add blk-mq.h include
block: loop: don't handle REQ_FUA explicitly
block: loop: introduce lo_discard() and lo_req_flush()
block: loop: say goodby to bio
block: loop: improve performance via blk-mq
Diffstat (limited to 'drivers/block/nvme-core.c')
-rw-r--r--  drivers/block/nvme-core.c | 128
1 file changed, 89 insertions(+), 39 deletions(-)
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index d826bf3e62c8..cbdfbbf98392 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -144,8 +144,37 @@ struct nvme_cmd_info {
 	void *ctx;
 	int aborted;
 	struct nvme_queue *nvmeq;
+	struct nvme_iod iod[0];
 };
 
+/*
+ * Max size of iod being embedded in the request payload
+ */
+#define NVME_INT_PAGES		2
+#define NVME_INT_BYTES(dev)	(NVME_INT_PAGES * (dev)->page_size)
+
+/*
+ * Will slightly overestimate the number of pages needed.  This is OK
+ * as it only leads to a small amount of wasted memory for the lifetime of
+ * the I/O.
+ */
+static int nvme_npages(unsigned size, struct nvme_dev *dev)
+{
+	unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size);
+	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
+}
+
+static unsigned int nvme_cmd_size(struct nvme_dev *dev)
+{
+	unsigned int ret = sizeof(struct nvme_cmd_info);
+
+	ret += sizeof(struct nvme_iod);
+	ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES(dev), dev);
+	ret += sizeof(struct scatterlist) * NVME_INT_PAGES;
+
+	return ret;
+}
+
 static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 				unsigned int hctx_idx)
 {
@@ -218,6 +247,19 @@ static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx,
 	blk_mq_start_request(blk_mq_rq_from_pdu(cmd));
 }
 
+static void *iod_get_private(struct nvme_iod *iod)
+{
+	return (void *) (iod->private & ~0x1UL);
+}
+
+/*
+ * If bit 0 is set, the iod is embedded in the request payload.
+ */
+static bool iod_should_kfree(struct nvme_iod *iod)
+{
+	return (iod->private & 0x01) == 0;
+}
+
 /* Special values must be less than 0x1000 */
 #define CMD_CTX_BASE		((void *)POISON_POINTER_DELTA)
 #define CMD_CTX_CANCELLED	(0x30C + CMD_CTX_BASE)
@@ -361,35 +403,53 @@ static __le64 **iod_list(struct nvme_iod *iod)
 	return ((void *)iod) + iod->offset;
 }
 
-/*
- * Will slightly overestimate the number of pages needed.  This is OK
- * as it only leads to a small amount of wasted memory for the lifetime of
- * the I/O.
- */
-static int nvme_npages(unsigned size, struct nvme_dev *dev)
+static inline void iod_init(struct nvme_iod *iod, unsigned nbytes,
+			    unsigned nseg, unsigned long private)
 {
-	unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size);
-	return DIV_ROUND_UP(8 * nprps, dev->page_size - 8);
+	iod->private = private;
+	iod->offset = offsetof(struct nvme_iod, sg[nseg]);
+	iod->npages = -1;
+	iod->length = nbytes;
+	iod->nents = 0;
 }
 
 static struct nvme_iod *
-nvme_alloc_iod(unsigned nseg, unsigned nbytes, struct nvme_dev *dev, gfp_t gfp)
+__nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev,
+		 unsigned long priv, gfp_t gfp)
 {
 	struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
-				sizeof(__le64 *) * nvme_npages(nbytes, dev) +
+				sizeof(__le64 *) * nvme_npages(bytes, dev) +
 				sizeof(struct scatterlist) * nseg, gfp);
 
-	if (iod) {
-		iod->offset = offsetof(struct nvme_iod, sg[nseg]);
-		iod->npages = -1;
-		iod->length = nbytes;
-		iod->nents = 0;
-		iod->first_dma = 0ULL;
-	}
+	if (iod)
+		iod_init(iod, bytes, nseg, priv);
 
 	return iod;
 }
 
+static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev,
+			               gfp_t gfp)
+{
+	unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(rq) :
+						sizeof(struct nvme_dsm_range);
+	unsigned long mask = 0;
+	struct nvme_iod *iod;
+
+	if (rq->nr_phys_segments <= NVME_INT_PAGES &&
+	    size <= NVME_INT_BYTES(dev)) {
+		struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq);
+
+		iod = cmd->iod;
+		mask = 0x01;
+		iod_init(iod, size, rq->nr_phys_segments,
+				(unsigned long) rq | 0x01);
+		return iod;
+	}
+
+	return __nvme_alloc_iod(rq->nr_phys_segments, size, dev,
+				(unsigned long) rq, gfp);
+}
+
 void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
 {
 	const int last_prp = dev->page_size / 8 - 1;
@@ -405,7 +465,9 @@ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
 		dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
 		prp_dma = next_prp_dma;
 	}
-	kfree(iod);
+
+	if (iod_should_kfree(iod))
+		kfree(iod);
 }
 
 static int nvme_error_status(u16 status)
@@ -424,7 +486,7 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
 						struct nvme_completion *cqe)
 {
 	struct nvme_iod *iod = ctx;
-	struct request *req = iod->private;
+	struct request *req = iod_get_private(iod);
 	struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
 
 	u16 status = le16_to_cpup(&cqe->status) >> 1;
@@ -585,7 +647,7 @@ static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
 static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
 							struct nvme_ns *ns)
 {
-	struct request *req = iod->private;
+	struct request *req = iod_get_private(iod);
 	struct nvme_command *cmnd;
 	u16 control = 0;
 	u32 dsmgmt = 0;
@@ -626,17 +688,12 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	struct request *req = bd->rq;
 	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
 	struct nvme_iod *iod;
-	int psegs = req->nr_phys_segments;
 	enum dma_data_direction dma_dir;
-	unsigned size = !(req->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(req) :
-						sizeof(struct nvme_dsm_range);
 
-	iod = nvme_alloc_iod(psegs, size, ns->dev, GFP_ATOMIC);
+	iod = nvme_alloc_iod(req, ns->dev, GFP_ATOMIC);
 	if (!iod)
 		return BLK_MQ_RQ_QUEUE_BUSY;
 
-	iod->private = req;
-
 	if (req->cmd_flags & REQ_DISCARD) {
 		void *range;
 		/*
@@ -651,10 +708,10 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 			goto retry_cmd;
 		iod_list(iod)[0] = (__le64 *)range;
 		iod->npages = 0;
-	} else if (psegs) {
+	} else if (req->nr_phys_segments) {
 		dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
 
-		sg_init_table(iod->sg, psegs);
+		sg_init_table(iod->sg, req->nr_phys_segments);
 		iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
 		if (!iod->nents)
 			goto error_cmd;
@@ -1137,21 +1194,14 @@ static void nvme_free_queue(struct nvme_queue *nvmeq)
 
 static void nvme_free_queues(struct nvme_dev *dev, int lowest)
 {
-	LLIST_HEAD(q_list);
-	struct nvme_queue *nvmeq, *next;
-	struct llist_node *entry;
 	int i;
 
 	for (i = dev->queue_count - 1; i >= lowest; i--) {
 		struct nvme_queue *nvmeq = dev->queues[i];
-		llist_add(&nvmeq->node, &q_list);
 		dev->queue_count--;
 		dev->queues[i] = NULL;
-	}
-	synchronize_rcu();
-	entry = llist_del_all(&q_list);
-	llist_for_each_entry_safe(nvmeq, next, entry, node)
 		nvme_free_queue(nvmeq);
+	}
 }
 
 /**
@@ -1408,7 +1458,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
 		dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1;
 		dev->admin_tagset.timeout = ADMIN_TIMEOUT;
 		dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
-		dev->admin_tagset.cmd_size = sizeof(struct nvme_cmd_info);
+		dev->admin_tagset.cmd_size = nvme_cmd_size(dev);
 		dev->admin_tagset.driver_data = dev;
 
 		if (blk_mq_alloc_tag_set(&dev->admin_tagset))
@@ -1522,7 +1572,7 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
 	}
 
 	err = -ENOMEM;
-	iod = nvme_alloc_iod(count, length, dev, GFP_KERNEL);
+	iod = __nvme_alloc_iod(count, length, dev, 0, GFP_KERNEL);
 	if (!iod)
 		goto put_pages;
 
@@ -2148,7 +2198,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
 	dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
 	dev->tagset.queue_depth =
 				min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
-	dev->tagset.cmd_size = sizeof(struct nvme_cmd_info);
+	dev->tagset.cmd_size = nvme_cmd_size(dev);
 	dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
 	dev->tagset.driver_data = dev;
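
A quick sanity check on what the bigger cmd_size costs: nvme_cmd_size() adds one struct nvme_iod, one PRP-list pointer per page that nvme_npages() reports for an NVME_INT_BYTES (8 KB) transfer, and NVME_INT_PAGES scatterlist entries to every preallocated request. The sketch below reruns that arithmetic in userspace, assuming a 4 KiB device page (dev->page_size is really taken from the controller), to show that the deliberately pessimistic estimate still collapses to a single PRP-list page.

#include <stdio.h>

/* Same rounding helper the kernel uses. */
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

#define PAGE_SIZE	4096				/* assumption: 4 KiB host pages */
#define NVME_INT_PAGES	2
#define NVME_INT_BYTES	(NVME_INT_PAGES * PAGE_SIZE)	/* 8 KiB embedding cut-off */

/* Mirror of the patch's nvme_npages(): PRP-list pages needed for 'size'
 * bytes, deliberately overestimating (size + page_size rather than size - 1).
 * The divisor uses the host PAGE_SIZE, as in the added version. */
static int nvme_npages(unsigned size, unsigned dev_page_size)
{
	unsigned nprps = DIV_ROUND_UP(size + dev_page_size, dev_page_size);

	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
}

int main(void)
{
	/* 8 KiB worst case: 3 PRP entries, still a single PRP-list page,
	 * so cmd_size grows by sizeof(struct nvme_iod) + one __le64 *
	 * + NVME_INT_PAGES scatterlist entries per preallocated request. */
	printf("npages(%d) = %d\n", NVME_INT_BYTES,
	       nvme_npages(NVME_INT_BYTES, PAGE_SIZE));
	return 0;
}

Compiled and run, it prints npages(8192) = 1: the embedded fast path reserves a few dozen extra bytes per tag, which is what buys back the roughly 1% of high-IOPS CPU time spent on per-IO kmalloc/kfree mentioned in the pull message.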