author      Linus Torvalds <torvalds@linux-foundation.org>   2018-01-29 11:51:49 -0800
committer   Linus Torvalds <torvalds@linux-foundation.org>   2018-01-29 11:51:49 -0800
commit      0a4b6e2f80aad46fb55a5cf7b1664c0aef030ee0
tree        cefccd67dc1f27bb45830f6b8065dd4a1c05e83b   /drivers/nvme/host
parent      9697e9da84299d0d715d515dd2cc48f1eceb277d
parent      796baeeef85a40b3495a907fb7425086e7010102
Merge branch 'for-4.16/block' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe:
"This is the main pull request for block IO related changes for the
4.16 kernel. Nothing major in this pull request, but a good amount of
improvements and fixes all over the map. This contains:
- BFQ improvements, fixes, and cleanups from Angelo, Chiara, and
Paolo.
- Support for SMR zones for deadline and mq-deadline from Damien and
Christoph.
- Set of fixes for bcache by way of Michael Lyle, including fixes
from himself, Kent, Rui, Tang, and Coly.
- Series from Matias for lightnvm with fixes from Hans Holmberg,
Javier, and Matias. Mostly centered around pblk, and the removal
of rrpc 1.2 in preparation for supporting 2.0.
- A couple of NVMe pull requests from Christoph. Nothing major in
here, just fixes and cleanups, and support for command tracing from
Johannes.
- Support for blk-throttle for tracking reads and writes separately.
From Joseph Qi. A few cleanups/fixes also for blk-throttle from
Weiping.
- Series from Mike Snitzer that enables dm to register its queue more
logically, something that's always been problematic on dm since
it's a stacked device.
- Series from Ming cleaning up some of the bio accessor use, in
preparation for supporting multipage bvecs.
- Various fixes from Ming closing up holes around queue mapping and
quiescing.
- BSD partition fix from Richard Narron, fixing a problem where we
can't mount newer (10/11) FreeBSD partitions.
- Series from Tejun reworking blk-mq timeout handling. The previous
scheme relied on atomic bits, but it had races where we would think
a request had timed out if it got reused at the wrong time.
- null_blk now supports faking timeouts, to enable us to better
exercise and test that functionality separately. From me.
- Kill the separate atomic poll bit in the request struct. After
this, we don't use the atomic bits on blk-mq anymore at all. From
me.
- sgl_alloc/free helpers from Bart (a short usage sketch follows this message).
- Heavily contended tag case scalability improvement from me.
- Various little fixes and cleanups from Arnd, Bart, Corentin,
Douglas, Eryu, Goldwyn, and myself"
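
The sgl_alloc/free helpers mentioned above (from lib/scatterlist.c, exported when CONFIG_SGL_ALLOC is set) replace open-coded scatterlist allocation loops in their users. The snippet below is a minimal, illustrative sketch of the calling pattern; the example_* function names and the GFP flags are assumptions for illustration, not code from this merge:

    #include <linux/gfp.h>
    #include <linux/scatterlist.h>

    /* Allocate a scatterlist covering @length bytes; sgl_alloc() picks the
     * number of entries and reports it through @nents. */
    static struct scatterlist *example_alloc_sgl(unsigned long long length,
                                                 unsigned int *nents)
    {
            return sgl_alloc(length, GFP_KERNEL, nents);
    }

    /* Free a scatterlist previously obtained from sgl_alloc(). */
    static void example_free_sgl(struct scatterlist *sgl)
    {
            sgl_free(sgl);
    }

For allocations that need a specific page order or a chainable list, sgl_alloc_order() takes the order and a chainable flag explicitly (it is also the helper fixed by the lib/scatterlist commit in the shortlog below).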
* 'for-4.16/block' of git://git.kernel.dk/linux-block: (186 commits)
block: remove smart1,2.h
nvme: add tracepoint for nvme_complete_rq
nvme: add tracepoint for nvme_setup_cmd
nvme-pci: introduce RECONNECTING state to mark initializing procedure
nvme-rdma: remove redundant boolean for inline_data
nvme: don't free uuid pointer before printing it
nvme-pci: Suspend queues after deleting them
bsg: use pr_debug instead of hand crafted macros
blk-mq-debugfs: don't allow write on attributes with seq_operations set
nvme-pci: Fix queue double allocations
block: Set BIO_TRACE_COMPLETION on new bio during split
blk-throttle: use queue_is_rq_based
block: Remove kblockd_schedule_delayed_work{,_on}()
blk-mq: Avoid that blk_mq_delay_run_hw_queue() introduces unintended delays
blk-mq: Rename blk_mq_request_direct_issue() into blk_mq_request_issue_directly()
lib/scatterlist: Fix chaining support in sgl_alloc_order()
blk-throttle: track read and write request individually
block: add bdev_read_only() checks to common helpers
block: fail op_is_write() requests to read-only partitions
blk-throttle: export io_serviced_recursive, io_service_bytes_recursive
...
Diffstat (limited to 'drivers/nvme/host')
-rw-r--r--   drivers/nvme/host/Makefile    |   4
-rw-r--r--   drivers/nvme/host/core.c      | 134
-rw-r--r--   drivers/nvme/host/fabrics.c   |  22
-rw-r--r--   drivers/nvme/host/fabrics.h   |   2
-rw-r--r--   drivers/nvme/host/fc.c        |   7
-rw-r--r--   drivers/nvme/host/lightnvm.c  | 185
-rw-r--r--   drivers/nvme/host/multipath.c |  44
-rw-r--r--   drivers/nvme/host/nvme.h      |   9
-rw-r--r--   drivers/nvme/host/pci.c       | 216
-rw-r--r--   drivers/nvme/host/rdma.c      |   6
-rw-r--r--   drivers/nvme/host/trace.c     | 130
-rw-r--r--   drivers/nvme/host/trace.h     | 165
12 files changed, 611 insertions, 313 deletions
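
Before the raw diff, one hunk worth flagging: nvme_req_needs_failover() in multipath.c now receives the translated blk_status_t and defers the retry decision to the generic blk_path_error() helper instead of an open-coded switch over NVMe status codes. The sketch below is a close paraphrase of that helper as introduced in include/linux/blk_types.h for this cycle; treat it as illustrative rather than verbatim kernel source:

    /* Errors generated by the target itself are not path failures and
     * must not be retried on another path. */
    static inline bool blk_path_error(blk_status_t error)
    {
            switch (error) {
            case BLK_STS_NOTSUPP:
            case BLK_STS_NOSPC:
            case BLK_STS_TARGET:
            case BLK_STS_NEXUS:
            case BLK_STS_MEDIUM:
            case BLK_STS_PROTECTION:
                    return false;
            default:
                    /* Anything else could be a path failure, so the request
                     * is a candidate for failover/retry. */
                    return true;
            }
    }

This is also why nvme_error_status() in core.c gains mappings such as NVME_SC_LBA_RANGE to BLK_STS_TARGET: the failover decision is now made on the generic block status rather than on the raw NVMe status code.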
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index a25fd43650ad..441e67e3a9d7 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -1,4 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 + +ccflags-y += -I$(src) + obj-$(CONFIG_NVME_CORE) += nvme-core.o obj-$(CONFIG_BLK_DEV_NVME) += nvme.o obj-$(CONFIG_NVME_FABRICS) += nvme-fabrics.o @@ -6,6 +9,7 @@ obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o obj-$(CONFIG_NVME_FC) += nvme-fc.o nvme-core-y := core.o +nvme-core-$(CONFIG_TRACING) += trace.o nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o nvme-core-$(CONFIG_NVM) += lightnvm.o diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 839650e0926a..e8104871cbbf 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -29,6 +29,9 @@ #include <linux/pm_qos.h> #include <asm/unaligned.h> +#define CREATE_TRACE_POINTS +#include "trace.h" + #include "nvme.h" #include "fabrics.h" @@ -65,9 +68,26 @@ static bool streams; module_param(streams, bool, 0644); MODULE_PARM_DESC(streams, "turn on support for Streams write directives"); +/* + * nvme_wq - hosts nvme related works that are not reset or delete + * nvme_reset_wq - hosts nvme reset works + * nvme_delete_wq - hosts nvme delete works + * + * nvme_wq will host works such are scan, aen handling, fw activation, + * keep-alive error recovery, periodic reconnects etc. nvme_reset_wq + * runs reset works which also flush works hosted on nvme_wq for + * serialization purposes. nvme_delete_wq host controller deletion + * works which flush reset works for serialization. + */ struct workqueue_struct *nvme_wq; EXPORT_SYMBOL_GPL(nvme_wq); +struct workqueue_struct *nvme_reset_wq; +EXPORT_SYMBOL_GPL(nvme_reset_wq); + +struct workqueue_struct *nvme_delete_wq; +EXPORT_SYMBOL_GPL(nvme_delete_wq); + static DEFINE_IDA(nvme_subsystems_ida); static LIST_HEAD(nvme_subsystems); static DEFINE_MUTEX(nvme_subsystems_lock); @@ -89,13 +109,13 @@ int nvme_reset_ctrl(struct nvme_ctrl *ctrl) { if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) return -EBUSY; - if (!queue_work(nvme_wq, &ctrl->reset_work)) + if (!queue_work(nvme_reset_wq, &ctrl->reset_work)) return -EBUSY; return 0; } EXPORT_SYMBOL_GPL(nvme_reset_ctrl); -static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) +int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) { int ret; @@ -104,6 +124,7 @@ static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) flush_work(&ctrl->reset_work); return ret; } +EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync); static void nvme_delete_ctrl_work(struct work_struct *work) { @@ -122,7 +143,7 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl) { if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING)) return -EBUSY; - if (!queue_work(nvme_wq, &ctrl->delete_work)) + if (!queue_work(nvme_delete_wq, &ctrl->delete_work)) return -EBUSY; return 0; } @@ -157,13 +178,20 @@ static blk_status_t nvme_error_status(struct request *req) return BLK_STS_OK; case NVME_SC_CAP_EXCEEDED: return BLK_STS_NOSPC; + case NVME_SC_LBA_RANGE: + return BLK_STS_TARGET; + case NVME_SC_BAD_ATTRIBUTES: case NVME_SC_ONCS_NOT_SUPPORTED: + case NVME_SC_INVALID_OPCODE: + case NVME_SC_INVALID_FIELD: + case NVME_SC_INVALID_NS: return BLK_STS_NOTSUPP; case NVME_SC_WRITE_FAULT: case NVME_SC_READ_ERROR: case NVME_SC_UNWRITTEN_BLOCK: case NVME_SC_ACCESS_DENIED: case NVME_SC_READ_ONLY: + case NVME_SC_COMPARE_FAILED: return BLK_STS_MEDIUM; case NVME_SC_GUARD_CHECK: case NVME_SC_APPTAG_CHECK: @@ -190,8 +218,12 @@ static inline bool nvme_req_needs_retry(struct request *req) void 
nvme_complete_rq(struct request *req) { - if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) { - if (nvme_req_needs_failover(req)) { + blk_status_t status = nvme_error_status(req); + + trace_nvme_complete_rq(req); + + if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) { + if (nvme_req_needs_failover(req, status)) { nvme_failover_req(req); return; } @@ -202,8 +234,7 @@ void nvme_complete_rq(struct request *req) return; } } - - blk_mq_end_request(req, nvme_error_status(req)); + blk_mq_end_request(req, status); } EXPORT_SYMBOL_GPL(nvme_complete_rq); @@ -232,6 +263,15 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, old_state = ctrl->state; switch (new_state) { + case NVME_CTRL_ADMIN_ONLY: + switch (old_state) { + case NVME_CTRL_RECONNECTING: + changed = true; + /* FALLTHRU */ + default: + break; + } + break; case NVME_CTRL_LIVE: switch (old_state) { case NVME_CTRL_NEW: @@ -247,6 +287,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, switch (old_state) { case NVME_CTRL_NEW: case NVME_CTRL_LIVE: + case NVME_CTRL_ADMIN_ONLY: changed = true; /* FALLTHRU */ default: @@ -266,6 +307,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, case NVME_CTRL_DELETING: switch (old_state) { case NVME_CTRL_LIVE: + case NVME_CTRL_ADMIN_ONLY: case NVME_CTRL_RESETTING: case NVME_CTRL_RECONNECTING: changed = true; @@ -591,6 +633,10 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, } cmd->common.command_id = req->tag; + if (ns) + trace_nvme_setup_nvm_cmd(req->q->id, cmd); + else + trace_nvme_setup_admin_cmd(cmd); return ret; } EXPORT_SYMBOL_GPL(nvme_setup_cmd); @@ -1217,16 +1263,27 @@ static int nvme_open(struct block_device *bdev, fmode_t mode) #ifdef CONFIG_NVME_MULTIPATH /* should never be called due to GENHD_FL_HIDDEN */ if (WARN_ON_ONCE(ns->head->disk)) - return -ENXIO; + goto fail; #endif if (!kref_get_unless_zero(&ns->kref)) - return -ENXIO; + goto fail; + if (!try_module_get(ns->ctrl->ops->module)) + goto fail_put_ns; + return 0; + +fail_put_ns: + nvme_put_ns(ns); +fail: + return -ENXIO; } static void nvme_release(struct gendisk *disk, fmode_t mode) { - nvme_put_ns(disk->private_data); + struct nvme_ns *ns = disk->private_data; + + module_put(ns->ctrl->ops->module); + nvme_put_ns(ns); } static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) @@ -2052,6 +2109,22 @@ static const struct attribute_group *nvme_subsys_attrs_groups[] = { NULL, }; +static int nvme_active_ctrls(struct nvme_subsystem *subsys) +{ + int count = 0; + struct nvme_ctrl *ctrl; + + mutex_lock(&subsys->lock); + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { + if (ctrl->state != NVME_CTRL_DELETING && + ctrl->state != NVME_CTRL_DEAD) + count++; + } + mutex_unlock(&subsys->lock); + + return count; +} + static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) { struct nvme_subsystem *subsys, *found; @@ -2090,7 +2163,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) * Verify that the subsystem actually supports multiple * controllers, else bail out. 
*/ - if (!(id->cmic & (1 << 1))) { + if (nvme_active_ctrls(found) && !(id->cmic & (1 << 1))) { dev_err(ctrl->device, "ignoring ctrl due to duplicate subnqn (%s).\n", found->subnqn); @@ -2257,7 +2330,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) shutdown_timeout, 60); if (ctrl->shutdown_timeout != shutdown_timeout) - dev_warn(ctrl->device, + dev_info(ctrl->device, "Shutdown timeout set to %u seconds\n", ctrl->shutdown_timeout); } else @@ -2341,8 +2414,14 @@ static int nvme_dev_open(struct inode *inode, struct file *file) struct nvme_ctrl *ctrl = container_of(inode->i_cdev, struct nvme_ctrl, cdev); - if (ctrl->state != NVME_CTRL_LIVE) + switch (ctrl->state) { + case NVME_CTRL_LIVE: + case NVME_CTRL_ADMIN_ONLY: + break; + default: return -EWOULDBLOCK; + } + file->private_data = ctrl; return 0; } @@ -2606,6 +2685,7 @@ static ssize_t nvme_sysfs_show_state(struct device *dev, static const char *const state_name[] = { [NVME_CTRL_NEW] = "new", [NVME_CTRL_LIVE] = "live", + [NVME_CTRL_ADMIN_ONLY] = "only-admin", [NVME_CTRL_RESETTING] = "resetting", [NVME_CTRL_RECONNECTING]= "reconnecting", [NVME_CTRL_DELETING] = "deleting", @@ -3079,6 +3159,8 @@ static void nvme_scan_work(struct work_struct *work) if (ctrl->state != NVME_CTRL_LIVE) return; + WARN_ON_ONCE(!ctrl->tagset); + if (nvme_identify_ctrl(ctrl, &id)) return; @@ -3099,8 +3181,7 @@ static void nvme_scan_work(struct work_struct *work) void nvme_queue_scan(struct nvme_ctrl *ctrl) { /* - * Do not queue new scan work when a controller is reset during - * removal. + * Only new queue scan work when admin and IO queues are both alive */ if (ctrl->state == NVME_CTRL_LIVE) queue_work(nvme_wq, &ctrl->scan_work); @@ -3477,16 +3558,26 @@ EXPORT_SYMBOL_GPL(nvme_reinit_tagset); int __init nvme_core_init(void) { - int result; + int result = -ENOMEM; nvme_wq = alloc_workqueue("nvme-wq", WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); if (!nvme_wq) - return -ENOMEM; + goto out; + + nvme_reset_wq = alloc_workqueue("nvme-reset-wq", + WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); + if (!nvme_reset_wq) + goto destroy_wq; + + nvme_delete_wq = alloc_workqueue("nvme-delete-wq", + WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); + if (!nvme_delete_wq) + goto destroy_reset_wq; result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme"); if (result < 0) - goto destroy_wq; + goto destroy_delete_wq; nvme_class = class_create(THIS_MODULE, "nvme"); if (IS_ERR(nvme_class)) { @@ -3505,8 +3596,13 @@ destroy_class: class_destroy(nvme_class); unregister_chrdev: unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); +destroy_delete_wq: + destroy_workqueue(nvme_delete_wq); +destroy_reset_wq: + destroy_workqueue(nvme_reset_wq); destroy_wq: destroy_workqueue(nvme_wq); +out: return result; } @@ -3516,6 +3612,8 @@ void nvme_core_exit(void) class_destroy(nvme_subsys_class); class_destroy(nvme_class); unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); + destroy_workqueue(nvme_delete_wq); + destroy_workqueue(nvme_reset_wq); destroy_workqueue(nvme_wq); } diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 894c2ccb3891..5dd4ceefed8f 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -493,7 +493,7 @@ EXPORT_SYMBOL_GPL(nvmf_should_reconnect); */ int nvmf_register_transport(struct nvmf_transport_ops *ops) { - if (!ops->create_ctrl) + if (!ops->create_ctrl || !ops->module) return -EINVAL; down_write(&nvmf_transports_rwsem); @@ -739,11 +739,14 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, ret = -ENOMEM; goto out; } - if 
(uuid_parse(p, &hostid)) { + ret = uuid_parse(p, &hostid); + if (ret) { pr_err("Invalid hostid %s\n", p); ret = -EINVAL; + kfree(p); goto out; } + kfree(p); break; case NVMF_OPT_DUP_CONNECT: opts->duplicate_connect = true; @@ -869,32 +872,41 @@ nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) goto out_unlock; } + if (!try_module_get(ops->module)) { + ret = -EBUSY; + goto out_unlock; + } + ret = nvmf_check_required_opts(opts, ops->required_opts); if (ret) - goto out_unlock; + goto out_module_put; ret = nvmf_check_allowed_opts(opts, NVMF_ALLOWED_OPTS | ops->allowed_opts | ops->required_opts); if (ret) - goto out_unlock; + goto out_module_put; ctrl = ops->create_ctrl(dev, opts); if (IS_ERR(ctrl)) { ret = PTR_ERR(ctrl); - goto out_unlock; + goto out_module_put; } if (strcmp(ctrl->subsys->subnqn, opts->subsysnqn)) { dev_warn(ctrl->device, "controller returned incorrect NQN: \"%s\".\n", ctrl->subsys->subnqn); + module_put(ops->module); up_read(&nvmf_transports_rwsem); nvme_delete_ctrl_sync(ctrl); return ERR_PTR(-EINVAL); } + module_put(ops->module); up_read(&nvmf_transports_rwsem); return ctrl; +out_module_put: + module_put(ops->module); out_unlock: up_read(&nvmf_transports_rwsem); out_free_opts: diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 9ba614953607..25b19f722f5b 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -108,6 +108,7 @@ struct nvmf_ctrl_options { * fabric implementation of NVMe fabrics. * @entry: Used by the fabrics library to add the new * registration entry to its linked-list internal tree. + * @module: Transport module reference * @name: Name of the NVMe fabric driver implementation. * @required_opts: sysfs command-line options that must be specified * when adding a new NVMe controller. 
@@ -126,6 +127,7 @@ struct nvmf_ctrl_options { */ struct nvmf_transport_ops { struct list_head entry; + struct module *module; const char *name; int required_opts; int allowed_opts; diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 794e66e4aa20..99bf51c7e513 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2921,6 +2921,9 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0); nvme_fc_free_queue(&ctrl->queues[0]); + /* re-enable the admin_q so anything new can fast fail */ + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_fc_ctlr_inactive_on_rport(ctrl); } @@ -2935,6 +2938,9 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl) * waiting for io to terminate */ nvme_fc_delete_association(ctrl); + + /* resume the io queues so that things will fast fail */ + nvme_start_queues(nctrl); } static void @@ -3380,6 +3386,7 @@ nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts) static struct nvmf_transport_ops nvme_fc_transport = { .name = "fc", + .module = THIS_MODULE, .required_opts = NVMF_OPT_TRADDR | NVMF_OPT_HOST_TRADDR, .allowed_opts = NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_CTRL_LOSS_TMO, .create_ctrl = nvme_fc_create_ctrl, diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index ba3d7f3349e5..50ef71ee3d86 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -31,27 +31,10 @@ enum nvme_nvm_admin_opcode { nvme_nvm_admin_identity = 0xe2, - nvme_nvm_admin_get_l2p_tbl = 0xea, nvme_nvm_admin_get_bb_tbl = 0xf2, nvme_nvm_admin_set_bb_tbl = 0xf1, }; -struct nvme_nvm_hb_rw { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __u64 rsvd2; - __le64 metadata; - __le64 prp1; - __le64 prp2; - __le64 spba; - __le16 length; - __le16 control; - __le32 dsmgmt; - __le64 slba; -}; - struct nvme_nvm_ph_rw { __u8 opcode; __u8 flags; @@ -80,19 +63,6 @@ struct nvme_nvm_identity { __u32 rsvd11[5]; }; -struct nvme_nvm_l2ptbl { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __le32 cdw2[4]; - __le64 prp1; - __le64 prp2; - __le64 slba; - __le32 nlb; - __le16 cdw14[6]; -}; - struct nvme_nvm_getbbtbl { __u8 opcode; __u8 flags; @@ -139,9 +109,7 @@ struct nvme_nvm_command { union { struct nvme_common_command common; struct nvme_nvm_identity identity; - struct nvme_nvm_hb_rw hb_rw; struct nvme_nvm_ph_rw ph_rw; - struct nvme_nvm_l2ptbl l2p; struct nvme_nvm_getbbtbl get_bb; struct nvme_nvm_setbbtbl set_bb; struct nvme_nvm_erase_blk erase; @@ -167,7 +135,7 @@ struct nvme_nvm_id_group { __u8 num_lun; __u8 num_pln; __u8 rsvd1; - __le16 num_blk; + __le16 num_chk; __le16 num_pg; __le16 fpg_sz; __le16 csecs; @@ -234,11 +202,9 @@ struct nvme_nvm_bb_tbl { static inline void _nvme_nvm_check_size(void) { BUILD_BUG_ON(sizeof(struct nvme_nvm_identity) != 64); - BUILD_BUG_ON(sizeof(struct nvme_nvm_hb_rw) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_ph_rw) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_getbbtbl) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_setbbtbl) != 64); - BUILD_BUG_ON(sizeof(struct nvme_nvm_l2ptbl) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_erase_blk) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_id_group) != 960); BUILD_BUG_ON(sizeof(struct nvme_nvm_addr_format) != 16); @@ -249,51 +215,58 @@ static inline void _nvme_nvm_check_size(void) static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id) { struct nvme_nvm_id_group *src; - struct nvm_id_group *dst; + struct nvm_id_group *grp; + int sec_per_pg, sec_per_pl, pg_per_blk; if 
(nvme_nvm_id->cgrps != 1) return -EINVAL; src = &nvme_nvm_id->groups[0]; - dst = &nvm_id->grp; - - dst->mtype = src->mtype; - dst->fmtype = src->fmtype; - dst->num_ch = src->num_ch; - dst->num_lun = src->num_lun; - dst->num_pln = src->num_pln; - - dst->num_pg = le16_to_cpu(src->num_pg); - dst->num_blk = le16_to_cpu(src->num_blk); - dst->fpg_sz = le16_to_cpu(src->fpg_sz); - dst->csecs = le16_to_cpu(src->csecs); - dst->sos = le16_to_cpu(src->sos); - - dst->trdt = le32_to_cpu(src->trdt); - dst->trdm = le32_to_cpu(src->trdm); - dst->tprt = le32_to_cpu(src->tprt); - dst->tprm = le32_to_cpu(src->tprm); - dst->tbet = le32_to_cpu(src->tbet); - dst->tbem = le32_to_cpu(src->tbem); - dst->mpos = le32_to_cpu(src->mpos); - dst->mccap = le32_to_cpu(src->mccap); - - dst->cpar = le16_to_cpu(src->cpar); - - if (dst->fmtype == NVM_ID_FMTYPE_MLC) { - memcpy(dst->lptbl.id, src->lptbl.id, 8); - dst->lptbl.mlc.num_pairs = - le16_to_cpu(src->lptbl.mlc.num_pairs); - - if (dst->lptbl.mlc.num_pairs > NVME_NVM_LP_MLC_PAIRS) { - pr_err("nvm: number of MLC pairs not supported\n"); - return -EINVAL; - } + grp = &nvm_id->grp; + + grp->mtype = src->mtype; + grp->fmtype = src->fmtype; + + grp->num_ch = src->num_ch; + grp->num_lun = src->num_lun; + + grp->num_chk = le16_to_cpu(src->num_chk); + grp->csecs = le16_to_cpu(src->csecs); + grp->sos = le16_to_cpu(src->sos); + + pg_per_blk = le16_to_cpu(src->num_pg); + sec_per_pg = le16_to_cpu(src->fpg_sz) / grp->csecs; + sec_per_pl = sec_per_pg * src->num_pln; + grp->clba = sec_per_pl * pg_per_blk; + grp->ws_per_chk = pg_per_blk; - memcpy(dst->lptbl.mlc.pairs, src->lptbl.mlc.pairs, - dst->lptbl.mlc.num_pairs); + grp->mpos = le32_to_cpu(src->mpos); + grp->cpar = le16_to_cpu(src->cpar); + grp->mccap = le32_to_cpu(src->mccap); + + grp->ws_opt = grp->ws_min = sec_per_pg; + grp->ws_seq = NVM_IO_SNGL_ACCESS; + + if (grp->mpos & 0x020202) { + grp->ws_seq = NVM_IO_DUAL_ACCESS; + grp->ws_opt <<= 1; + } else if (grp->mpos & 0x040404) { + grp->ws_seq = NVM_IO_QUAD_ACCESS; + grp->ws_opt <<= 2; } + grp->trdt = le32_to_cpu(src->trdt); + grp->trdm = le32_to_cpu(src->trdm); + grp->tprt = le32_to_cpu(src->tprt); + grp->tprm = le32_to_cpu(src->tprm); + grp->tbet = le32_to_cpu(src->tbet); + grp->tbem = le32_to_cpu(src->tbem); + + /* 1.2 compatibility */ + grp->num_pln = src->num_pln; + grp->num_pg = le16_to_cpu(src->num_pg); + grp->fpg_sz = le16_to_cpu(src->fpg_sz); + return 0; } @@ -332,62 +305,6 @@ out: return ret; } -static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb, - nvm_l2p_update_fn *update_l2p, void *priv) -{ - struct nvme_ns *ns = nvmdev->q->queuedata; - struct nvme_nvm_command c = {}; - u32 len = queue_max_hw_sectors(ns->ctrl->admin_q) << 9; - u32 nlb_pr_rq = len / sizeof(u64); - u64 cmd_slba = slba; - void *entries; - int ret = 0; - - c.l2p.opcode = nvme_nvm_admin_get_l2p_tbl; - c.l2p.nsid = cpu_to_le32(ns->head->ns_id); - entries = kmalloc(len, GFP_KERNEL); - if (!entries) - return -ENOMEM; - - while (nlb) { - u32 cmd_nlb = min(nlb_pr_rq, nlb); - u64 elba = slba + cmd_nlb; - - c.l2p.slba = cpu_to_le64(cmd_slba); - c.l2p.nlb = cpu_to_le32(cmd_nlb); - - ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, - (struct nvme_command *)&c, entries, len); - if (ret) { - dev_err(ns->ctrl->device, - "L2P table transfer failed (%d)\n", ret); - ret = -EIO; - goto out; - } - - if (unlikely(elba > nvmdev->total_secs)) { - pr_err("nvm: L2P data from device is out of bounds!\n"); - ret = -EINVAL; - goto out; - } - - /* Transform physical address to target address space */ - 
nvm_part_to_tgt(nvmdev, entries, cmd_nlb); - - if (update_l2p(cmd_slba, cmd_nlb, entries, priv)) { - ret = -EINTR; - goto out; - } - - cmd_slba += cmd_nlb; - nlb -= cmd_nlb; - } - -out: - kfree(entries); - return ret; -} - static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, u8 *blks) { @@ -397,7 +314,7 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, struct nvme_ctrl *ctrl = ns->ctrl; struct nvme_nvm_command c = {}; struct nvme_nvm_bb_tbl *bb_tbl; - int nr_blks = geo->blks_per_lun * geo->plane_mode; + int nr_blks = geo->nr_chks * geo->plane_mode; int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blks; int ret = 0; @@ -438,7 +355,7 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, goto out; } - memcpy(blks, bb_tbl->blk, geo->blks_per_lun * geo->plane_mode); + memcpy(blks, bb_tbl->blk, geo->nr_chks * geo->plane_mode); out: kfree(bb_tbl); return ret; @@ -474,10 +391,6 @@ static inline void nvme_nvm_rqtocmd(struct nvm_rq *rqd, struct nvme_ns *ns, c->ph_rw.metadata = cpu_to_le64(rqd->dma_meta_list); c->ph_rw.control = cpu_to_le16(rqd->flags); c->ph_rw.length = cpu_to_le16(rqd->nr_ppas - 1); - - if (rqd->opcode == NVM_OP_HBWRITE || rqd->opcode == NVM_OP_HBREAD) - c->hb_rw.slba = cpu_to_le64(nvme_block_nr(ns, - rqd->bio->bi_iter.bi_sector)); } static void nvme_nvm_end_io(struct request *rq, blk_status_t status) @@ -597,8 +510,6 @@ static void nvme_nvm_dev_dma_free(void *pool, void *addr, static struct nvm_dev_ops nvme_nvm_dev_ops = { .identity = nvme_nvm_identity, - .get_l2p_tbl = nvme_nvm_get_l2p_tbl, - .get_bb_tbl = nvme_nvm_get_bb_tbl, .set_bb_tbl = nvme_nvm_set_bb_tbl, @@ -883,7 +794,7 @@ static ssize_t nvm_dev_attr_show(struct device *dev, } else if (strcmp(attr->name, "num_planes") == 0) { return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pln); } else if (strcmp(attr->name, "num_blocks") == 0) { /* u16 */ - return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_blk); + return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_chk); } else if (strcmp(attr->name, "num_pages") == 0) { return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pg); } else if (strcmp(attr->name, "page_size") == 0) { diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 1218a9fca846..3b211d9e58b8 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -33,51 +33,11 @@ void nvme_failover_req(struct request *req) kblockd_schedule_work(&ns->head->requeue_work); } -bool nvme_req_needs_failover(struct request *req) +bool nvme_req_needs_failover(struct request *req, blk_status_t error) { if (!(req->cmd_flags & REQ_NVME_MPATH)) return false; - - switch (nvme_req(req)->status & 0x7ff) { - /* - * Generic command status: - */ - case NVME_SC_INVALID_OPCODE: - case NVME_SC_INVALID_FIELD: - case NVME_SC_INVALID_NS: - case NVME_SC_LBA_RANGE: - case NVME_SC_CAP_EXCEEDED: - case NVME_SC_RESERVATION_CONFLICT: - return false; - - /* - * I/O command set specific error. Unfortunately these values are - * reused for fabrics commands, but those should never get here. 
- */ - case NVME_SC_BAD_ATTRIBUTES: - case NVME_SC_INVALID_PI: - case NVME_SC_READ_ONLY: - case NVME_SC_ONCS_NOT_SUPPORTED: - WARN_ON_ONCE(nvme_req(req)->cmd->common.opcode == - nvme_fabrics_command); - return false; - - /* - * Media and Data Integrity Errors: - */ - case NVME_SC_WRITE_FAULT: - case NVME_SC_READ_ERROR: - case NVME_SC_GUARD_CHECK: - case NVME_SC_APPTAG_CHECK: - case NVME_SC_REFTAG_CHECK: - case NVME_SC_COMPARE_FAILED: - case NVME_SC_ACCESS_DENIED: - case NVME_SC_UNWRITTEN_BLOCK: - return false; - } - - /* Everything else could be a path failure, so should be retried */ - return true; + return blk_path_error(error); } void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index a00eabd06427..8e4550fa08f8 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -32,6 +32,8 @@ extern unsigned int admin_timeout; #define NVME_KATO_GRACE 10 extern struct workqueue_struct *nvme_wq; +extern struct workqueue_struct *nvme_reset_wq; +extern struct workqueue_struct *nvme_delete_wq; enum { NVME_NS_LBA = 0, @@ -119,6 +121,7 @@ static inline struct nvme_request *nvme_req(struct request *req) enum nvme_ctrl_state { NVME_CTRL_NEW, NVME_CTRL_LIVE, + NVME_CTRL_ADMIN_ONLY, /* Only admin queue live */ NVME_CTRL_RESETTING, NVME_CTRL_RECONNECTING, NVME_CTRL_DELETING, @@ -393,6 +396,7 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); void nvme_start_keep_alive(struct nvme_ctrl *ctrl); void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); int nvme_reset_ctrl(struct nvme_ctrl *ctrl); +int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl); int nvme_delete_ctrl(struct nvme_ctrl *ctrl); int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl); @@ -401,7 +405,7 @@ extern const struct block_device_operations nvme_ns_head_ops; #ifdef CONFIG_NVME_MULTIPATH void nvme_failover_req(struct request *req); -bool nvme_req_needs_failover(struct request *req); +bool nvme_req_needs_failover(struct request *req, blk_status_t error); void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); void nvme_mpath_add_disk(struct nvme_ns_head *head); @@ -430,7 +434,8 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) static inline void nvme_failover_req(struct request *req) { } -static inline bool nvme_req_needs_failover(struct request *req) +static inline bool nvme_req_needs_failover(struct request *req, + blk_status_t error) { return false; } diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 4276ebfff22b..6fe7af00a1f4 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -75,7 +75,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); * Represents an NVM Express device. Each nvme_dev is a PCI function. 
*/ struct nvme_dev { - struct nvme_queue **queues; + struct nvme_queue *queues; struct blk_mq_tag_set tagset; struct blk_mq_tag_set admin_tagset; u32 __iomem *dbs; @@ -365,7 +365,7 @@ static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { struct nvme_dev *dev = data; - struct nvme_queue *nvmeq = dev->queues[0]; + struct nvme_queue *nvmeq = &dev->queues[0]; WARN_ON(hctx_idx != 0); WARN_ON(dev->admin_tagset.tags[0] != hctx->tags); @@ -387,7 +387,7 @@ static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { struct nvme_dev *dev = data; - struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1]; + struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1]; if (!nvmeq->tags) nvmeq->tags = &dev->tagset.tags[hctx_idx]; @@ -403,7 +403,7 @@ static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req, struct nvme_dev *dev = set->driver_data; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0; - struct nvme_queue *nvmeq = dev->queues[queue_idx]; + struct nvme_queue *nvmeq = &dev->queues[queue_idx]; BUG_ON(!nvmeq); iod->nvmeq = nvmeq; @@ -1044,7 +1044,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl) { struct nvme_dev *dev = to_nvme_dev(ctrl); - struct nvme_queue *nvmeq = dev->queues[0]; + struct nvme_queue *nvmeq = &dev->queues[0]; struct nvme_command c; memset(&c, 0, sizeof(c)); @@ -1138,9 +1138,14 @@ static bool nvme_should_reset(struct nvme_dev *dev, u32 csts) */ bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO); - /* If there is a reset ongoing, we shouldn't reset again. */ - if (dev->ctrl.state == NVME_CTRL_RESETTING) + /* If there is a reset/reinit ongoing, we shouldn't reset again. */ + switch (dev->ctrl.state) { + case NVME_CTRL_RESETTING: + case NVME_CTRL_RECONNECTING: return false; + default: + break; + } /* We shouldn't reset unless the controller is on fatal error state * _or_ if we lost the communication with it. 
@@ -1280,7 +1285,6 @@ static void nvme_free_queue(struct nvme_queue *nvmeq) if (nvmeq->sq_cmds) dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), nvmeq->sq_cmds, nvmeq->sq_dma_addr); - kfree(nvmeq); } static void nvme_free_queues(struct nvme_dev *dev, int lowest) @@ -1288,10 +1292,8 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest) int i; for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) { - struct nvme_queue *nvmeq = dev->queues[i]; dev->ctrl.queue_count--; - dev->queues[i] = NULL; - nvme_free_queue(nvmeq); + nvme_free_queue(&dev->queues[i]); } } @@ -1323,12 +1325,7 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) { - struct nvme_queue *nvmeq = dev->queues[0]; - - if (!nvmeq) - return; - if (nvme_suspend_queue(nvmeq)) - return; + struct nvme_queue *nvmeq = &dev->queues[0]; if (shutdown) nvme_shutdown_ctrl(&dev->ctrl); @@ -1367,7 +1364,7 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, int qid, int depth) { - if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) { + if (qid && dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) { unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth), dev->ctrl.page_size); nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset; @@ -1382,13 +1379,13 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, return 0; } -static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, - int depth, int node) +static int nvme_alloc_queue(struct nvme_dev *dev, int qid, + int depth, int node) { - struct nvme_queue *nvmeq = kzalloc_node(sizeof(*nvmeq), GFP_KERNEL, - node); - if (!nvmeq) - return NULL; + struct nvme_queue *nvmeq = &dev->queues[qid]; + + if (dev->ctrl.queue_count > qid) + return 0; nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth), &nvmeq->cq_dma_addr, GFP_KERNEL); @@ -1407,17 +1404,15 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, nvmeq->q_depth = depth; nvmeq->qid = qid; nvmeq->cq_vector = -1; - dev->queues[qid] = nvmeq; dev->ctrl.queue_count++; - return nvmeq; + return 0; free_cqdma: dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes, nvmeq->cq_dma_addr); free_nvmeq: - kfree(nvmeq); - return NULL; + return -ENOMEM; } static int queue_request_irq(struct nvme_queue *nvmeq) @@ -1590,14 +1585,12 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) if (result < 0) return result; - nvmeq = dev->queues[0]; - if (!nvmeq) { - nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH, - dev_to_node(dev->dev)); - if (!nvmeq) - return -ENOMEM; - } + result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH, + dev_to_node(dev->dev)); + if (result) + return result; + nvmeq = &dev->queues[0]; aqa = nvmeq->q_depth - 1; aqa |= aqa << 16; @@ -1627,7 +1620,7 @@ static int nvme_create_io_queues(struct nvme_dev *dev) for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) { /* vector == qid - 1, match nvme_create_queue */ - if (!nvme_alloc_queue(dev, i, dev->q_depth, + if (nvme_alloc_queue(dev, i, dev->q_depth, pci_irq_get_node(to_pci_dev(dev->dev), i - 1))) { ret = -ENOMEM; break; @@ -1636,15 +1629,15 @@ static int nvme_create_io_queues(struct nvme_dev *dev) max = min(dev->max_qid, dev->ctrl.queue_count - 1); for (i = dev->online_queues; i <= max; i++) { - ret = nvme_create_queue(dev->queues[i], i); + ret = nvme_create_queue(&dev->queues[i], i); if (ret) break; } /* * 
Ignore failing Create SQ/CQ commands, we can continue with less - * than the desired aount of queues, and even a controller without - * I/O queues an still be used to issue admin commands. This might + * than the desired amount of queues, and even a controller without + * I/O queues can still be used to issue admin commands. This might * be useful to upgrade a buggy firmware for example. */ return ret >= 0 ? 0 : ret; @@ -1661,30 +1654,40 @@ static ssize_t nvme_cmb_show(struct device *dev, } static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL); -static void __iomem *nvme_map_cmb(struct nvme_dev *dev) +static u64 nvme_cmb_size_unit(struct nvme_dev *dev) { - u64 szu, size, offset; + u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK; + + return 1ULL << (12 + 4 * szu); +} + +static u32 nvme_cmb_size(struct nvme_dev *dev) +{ + return (dev->cmbsz >> NVME_CMBSZ_SZ_SHIFT) & NVME_CMBSZ_SZ_MASK; +} + +static void nvme_map_cmb(struct nvme_dev *dev) +{ + u64 size, offset; resource_size_t bar_size; struct pci_dev *pdev = to_pci_dev(dev->dev); - void __iomem *cmb; int bar; dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ); - if (!(NVME_CMB_SZ(dev->cmbsz))) - return NULL; + if (!dev->cmbsz) + return; dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC); if (!use_cmb_sqes) - return NULL; + return; - szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz)); - size = szu * NVME_CMB_SZ(dev->cmbsz); - offset = szu * NVME_CMB_OFST(dev->cmbloc); + size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev); + offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc); bar = NVME_CMB_BIR(dev->cmbloc); bar_size = pci_resource_len(pdev, bar); if (offset > bar_size) - return NULL; + return; /* * Controllers may support a CMB size larger than their BAR, @@ -1694,13 +1697,16 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev) if (size > bar_size - offset) size = bar_size - offset; - cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size); - if (!cmb) - return NULL; - + dev->cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size); + if (!dev->cmb) + return; dev->cmb_bus_addr = pci_bus_address(pdev, bar) + offset; dev->cmb_size = size; - return cmb; + + if (sysfs_add_file_to_group(&dev->ctrl.device->kobj, + &dev_attr_cmb.attr, NULL)) + dev_warn(dev->ctrl.device, + "failed to add sysfs attribute for CMB\n"); } static inline void nvme_release_cmb(struct nvme_dev *dev) @@ -1768,7 +1774,7 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred, dma_addr_t descs_dma; int i = 0; void **bufs; - u64 size = 0, tmp; + u64 size, tmp; tmp = (preferred + chunk_size - 1); do_div(tmp, chunk_size); @@ -1851,7 +1857,7 @@ static int nvme_setup_host_mem(struct nvme_dev *dev) u64 preferred = (u64)dev->ctrl.hmpre * 4096; u64 min = (u64)dev->ctrl.hmmin * 4096; u32 enable_bits = NVME_HOST_MEM_ENABLE; - int ret = 0; + int ret; preferred = min(preferred, max); if (min > max) { @@ -1892,7 +1898,7 @@ static int nvme_setup_host_mem(struct nvme_dev *dev) static int nvme_setup_io_queues(struct nvme_dev *dev) { - struct nvme_queue *adminq = dev->queues[0]; + struct nvme_queue *adminq = &dev->queues[0]; struct pci_dev *pdev = to_pci_dev(dev->dev); int result, nr_io_queues; unsigned long size; @@ -1905,7 +1911,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) if (nr_io_queues == 0) return 0; - if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) { + if (dev->cmb && (dev->cmbsz & NVME_CMBSZ_SQS)) { result = nvme_cmb_qdepth(dev, nr_io_queues, sizeof(struct nvme_command)); if (result > 0) @@ -2005,9 +2011,9 @@ static int 
nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) return 0; } -static void nvme_disable_io_queues(struct nvme_dev *dev, int queues) +static void nvme_disable_io_queues(struct nvme_dev *dev) { - int pass; + int pass, queues = dev->online_queues - 1; unsigned long timeout; u8 opcode = nvme_admin_delete_sq; @@ -2018,7 +2024,7 @@ static void nvme_disable_io_queues(struct nvme_dev *dev, int queues) retry: timeout = ADMIN_TIMEOUT; for (; i > 0; i--, sent++) - if (nvme_delete_queue(dev->queues[i], opcode)) + if (nvme_delete_queue(&dev->queues[i], opcode)) break; while (sent--) { @@ -2033,13 +2039,12 @@ static void nvme_disable_io_queues(struct nvme_dev *dev, int queues) } /* - * Return: error value if an error occurred setting up the queues or calling - * Identify Device. 0 if these succeeded, even if adding some of the - * namespaces failed. At the moment, these failures are silent. TBD which - * failures should be reported. + * return error value only when tagset allocation failed */ static int nvme_dev_add(struct nvme_dev *dev) { + int ret; + if (!dev->ctrl.tagset) { dev->tagset.ops = &nvme_mq_ops; dev->tagset.nr_hw_queues = dev->online_queues - 1; @@ -2055,8 +2060,12 @@ static int nvme_dev_add(struct nvme_dev *dev) dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; dev->tagset.driver_data = dev; - if (blk_mq_alloc_tag_set(&dev->tagset)) - return 0; + ret = blk_mq_alloc_tag_set(&dev->tagset); + if (ret) { + dev_warn(dev->ctrl.device, + "IO queues tagset allocation failed %d\n", ret); + return ret; + } dev->ctrl.tagset = &dev->tagset; nvme_dbbuf_set(dev); @@ -2122,22 +2131,7 @@ static int nvme_pci_enable(struct nvme_dev *dev) "set queue depth=%u\n", dev->q_depth); } - /* - * CMBs can currently only exist on >=1.2 PCIe devices. We only - * populate sysfs if a CMB is implemented. Since nvme_dev_attrs_group - * has no name we can pass NULL as final argument to - * sysfs_add_file_to_group. - */ - - if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 2, 0)) { - dev->cmb = nvme_map_cmb(dev); - if (dev->cmb) { - if (sysfs_add_file_to_group(&dev->ctrl.device->kobj, - &dev_attr_cmb.attr, NULL)) - dev_warn(dev->ctrl.device, - "failed to add sysfs attribute for CMB\n"); - } - } + nvme_map_cmb(dev); pci_enable_pcie_error_reporting(pdev); pci_save_state(pdev); @@ -2170,7 +2164,7 @@ static void nvme_pci_disable(struct nvme_dev *dev) static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) { - int i, queues; + int i; bool dead = true; struct pci_dev *pdev = to_pci_dev(dev->dev); @@ -2205,21 +2199,13 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) } nvme_stop_queues(&dev->ctrl); - queues = dev->online_queues - 1; - for (i = dev->ctrl.queue_count - 1; i > 0; i--) - nvme_suspend_queue(dev->queues[i]); - - if (dead) { - /* A device might become IO incapable very soon during - * probe, before the admin queue is configured. Thus, - * queue_count can be 0 here. 
- */ - if (dev->ctrl.queue_count) - nvme_suspend_queue(dev->queues[0]); - } else { - nvme_disable_io_queues(dev, queues); + if (!dead) { + nvme_disable_io_queues(dev); nvme_disable_admin_queue(dev, shutdown); } + for (i = dev->ctrl.queue_count - 1; i >= 0; i--) + nvme_suspend_queue(&dev->queues[i]); + nvme_pci_disable(dev); blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl); @@ -2289,6 +2275,7 @@ static void nvme_reset_work(struct work_struct *work) container_of(work, struct nvme_dev, ctrl.reset_work); bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL); int result = -ENODEV; + enum nvme_ctrl_state new_state = NVME_CTRL_LIVE; if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING)) goto out; @@ -2300,6 +2287,16 @@ static void nvme_reset_work(struct work_struct *work) if (dev->ctrl.ctrl_config & NVME_CC_ENABLE) nvme_dev_disable(dev, false); + /* + * Introduce RECONNECTING state from nvme-fc/rdma transports to mark the + * initializing procedure here. + */ + if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RECONNECTING)) { + dev_warn(dev->ctrl.device, + "failed to mark controller RECONNECTING\n"); + goto out; + } + result = nvme_pci_enable(dev); if (result) goto out; @@ -2352,15 +2349,23 @@ static void nvme_reset_work(struct work_struct *work) dev_warn(dev->ctrl.device, "IO queues not created\n"); nvme_kill_queues(&dev->ctrl); nvme_remove_namespaces(&dev->ctrl); + new_state = NVME_CTRL_ADMIN_ONLY; } else { nvme_start_queues(&dev->ctrl); nvme_wait_freeze(&dev->ctrl); - nvme_dev_add(dev); + /* hit this only when allocate tagset fails */ + if (nvme_dev_add(dev)) + new_state = NVME_CTRL_ADMIN_ONLY; nvme_unfreeze(&dev->ctrl); } - if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) { - dev_warn(dev->ctrl.device, "failed to mark controller live\n"); + /* + * If only admin queue live, keep it to do further investigation or + * recovery. 
+ */ + if (!nvme_change_ctrl_state(&dev->ctrl, new_state)) { + dev_warn(dev->ctrl.device, + "failed to mark controller state %d\n", new_state); goto out; } @@ -2468,8 +2473,9 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); if (!dev) return -ENOMEM; - dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *), - GFP_KERNEL, node); + + dev->queues = kcalloc_node(num_possible_cpus() + 1, + sizeof(struct nvme_queue), GFP_KERNEL, node); if (!dev->queues) goto free; @@ -2496,10 +2502,10 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto release_pools; - nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING); dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); - queue_work(nvme_wq, &dev->ctrl.reset_work); + nvme_reset_ctrl(&dev->ctrl); + return 0; release_pools: @@ -2523,7 +2529,7 @@ static void nvme_reset_prepare(struct pci_dev *pdev) static void nvme_reset_done(struct pci_dev *pdev) { struct nvme_dev *dev = pci_get_drvdata(pdev); - nvme_reset_ctrl(&dev->ctrl); + nvme_reset_ctrl_sync(&dev->ctrl); } static void nvme_shutdown(struct pci_dev *pdev) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 2a0bba7f50cf..2bc059f7d73c 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -66,7 +66,6 @@ struct nvme_rdma_request { struct ib_sge sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS]; u32 num_sge; int nents; - bool inline_data; struct ib_reg_wr reg_wr; struct ib_cqe reg_cqe; struct nvme_rdma_queue *queue; @@ -1092,7 +1091,6 @@ static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue, sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl)); sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET; - req->inline_data = true; req->num_sge++; return 0; } @@ -1164,7 +1162,6 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, int count, ret; req->num_sge = 1; - req->inline_data = false; refcount_set(&req->ref, 2); /* send and recv completions */ c->common.flags |= NVME_CMD_SGL_METABUF; @@ -2018,6 +2015,7 @@ out_free_ctrl: static struct nvmf_transport_ops nvme_rdma_transport = { .name = "rdma", + .module = THIS_MODULE, .required_opts = NVMF_OPT_TRADDR, .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO, @@ -2040,7 +2038,7 @@ static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data) } mutex_unlock(&nvme_rdma_ctrl_mutex); - flush_workqueue(nvme_wq); + flush_workqueue(nvme_delete_wq); } static struct ib_client nvme_rdma_ib_client = { diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c new file mode 100644 index 000000000000..41944bbef835 --- /dev/null +++ b/drivers/nvme/host/trace.c @@ -0,0 +1,130 @@ +/* + * NVM Express device driver tracepoints + * Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#include <asm/unaligned.h> +#include "trace.h" + +static const char *nvme_trace_create_sq(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u16 sqid = get_unaligned_le16(cdw10); + u16 qsize = get_unaligned_le16(cdw10 + 2); + u16 sq_flags = get_unaligned_le16(cdw10 + 4); + u16 cqid = get_unaligned_le16(cdw10 + 6); + + + trace_seq_printf(p, "sqid=%u, qsize=%u, sq_flags=0x%x, cqid=%u", + sqid, qsize, sq_flags, cqid); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvme_trace_create_cq(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u16 cqid = get_unaligned_le16(cdw10); + u16 qsize = get_unaligned_le16(cdw10 + 2); + u16 cq_flags = get_unaligned_le16(cdw10 + 4); + u16 irq_vector = get_unaligned_le16(cdw10 + 6); + + trace_seq_printf(p, "cqid=%u, qsize=%u, cq_flags=0x%x, irq_vector=%u", + cqid, qsize, cq_flags, irq_vector); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvme_trace_admin_identify(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 cns = cdw10[0]; + u16 ctrlid = get_unaligned_le16(cdw10 + 2); + + trace_seq_printf(p, "cns=%u, ctrlid=%u", cns, ctrlid); + trace_seq_putc(p, 0); + + return ret; +} + + + +static const char *nvme_trace_read_write(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u64 slba = get_unaligned_le64(cdw10); + u16 length = get_unaligned_le16(cdw10 + 8); + u16 control = get_unaligned_le16(cdw10 + 10); + u32 dsmgmt = get_unaligned_le32(cdw10 + 12); + u32 reftag = get_unaligned_le32(cdw10 + 16); + + trace_seq_printf(p, + "slba=%llu, len=%u, ctrl=0x%x, dsmgmt=%u, reftag=%u", + slba, length, control, dsmgmt, reftag); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvme_trace_dsm(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + + trace_seq_printf(p, "nr=%u, attributes=%u", + get_unaligned_le32(cdw10), + get_unaligned_le32(cdw10 + 4)); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvme_trace_common(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + + trace_seq_printf(p, "cdw10=%*ph", 24, cdw10); + trace_seq_putc(p, 0); + + return ret; +} + +const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, + u8 opcode, u8 *cdw10) +{ + switch (opcode) { + case nvme_admin_create_sq: + return nvme_trace_create_sq(p, cdw10); + case nvme_admin_create_cq: + return nvme_trace_create_cq(p, cdw10); + case nvme_admin_identify: + return nvme_trace_admin_identify(p, cdw10); + default: + return nvme_trace_common(p, cdw10); + } +} + +const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, + u8 opcode, u8 *cdw10) +{ + switch (opcode) { + case nvme_cmd_read: + case nvme_cmd_write: + case nvme_cmd_write_zeroes: + return nvme_trace_read_write(p, cdw10); + case nvme_cmd_dsm: + return nvme_trace_dsm(p, cdw10); + default: + return nvme_trace_common(p, cdw10); + } +} diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h new file mode 100644 index 000000000000..ea91fccd1bc0 --- /dev/null +++ b/drivers/nvme/host/trace.h @@ -0,0 +1,165 @@ +/* + * NVM Express device driver tracepoints + * Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM nvme + +#if !defined(_TRACE_NVME_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_NVME_H + +#include <linux/nvme.h> +#include <linux/tracepoint.h> +#include <linux/trace_seq.h> + +#include "nvme.h" + +#define nvme_admin_opcode_name(opcode) { opcode, #opcode } +#define show_admin_opcode_name(val) \ + __print_symbolic(val, \ + nvme_admin_opcode_name(nvme_admin_delete_sq), \ + nvme_admin_opcode_name(nvme_admin_create_sq), \ + nvme_admin_opcode_name(nvme_admin_get_log_page), \ + nvme_admin_opcode_name(nvme_admin_delete_cq), \ + nvme_admin_opcode_name(nvme_admin_create_cq), \ + nvme_admin_opcode_name(nvme_admin_identify), \ + nvme_admin_opcode_name(nvme_admin_abort_cmd), \ + nvme_admin_opcode_name(nvme_admin_set_features), \ + nvme_admin_opcode_name(nvme_admin_get_features), \ + nvme_admin_opcode_name(nvme_admin_async_event), \ + nvme_admin_opcode_name(nvme_admin_ns_mgmt), \ + nvme_admin_opcode_name(nvme_admin_activate_fw), \ + nvme_admin_opcode_name(nvme_admin_download_fw), \ + nvme_admin_opcode_name(nvme_admin_ns_attach), \ + nvme_admin_opcode_name(nvme_admin_keep_alive), \ + nvme_admin_opcode_name(nvme_admin_directive_send), \ + nvme_admin_opcode_name(nvme_admin_directive_recv), \ + nvme_admin_opcode_name(nvme_admin_dbbuf), \ + nvme_admin_opcode_name(nvme_admin_format_nvm), \ + nvme_admin_opcode_name(nvme_admin_security_send), \ + nvme_admin_opcode_name(nvme_admin_security_recv), \ + nvme_admin_opcode_name(nvme_admin_sanitize_nvm)) + +const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode, + u8 *cdw10); +#define __parse_nvme_admin_cmd(opcode, cdw10) \ + nvme_trace_parse_admin_cmd(p, opcode, cdw10) + +#define nvme_opcode_name(opcode) { opcode, #opcode } +#define show_opcode_name(val) \ + __print_symbolic(val, \ + nvme_opcode_name(nvme_cmd_flush), \ + nvme_opcode_name(nvme_cmd_write), \ + nvme_opcode_name(nvme_cmd_read), \ + nvme_opcode_name(nvme_cmd_write_uncor), \ + nvme_opcode_name(nvme_cmd_compare), \ + nvme_opcode_name(nvme_cmd_write_zeroes), \ + nvme_opcode_name(nvme_cmd_dsm), \ + nvme_opcode_name(nvme_cmd_resv_register), \ + nvme_opcode_name(nvme_cmd_resv_report), \ + nvme_opcode_name(nvme_cmd_resv_acquire), \ + nvme_opcode_name(nvme_cmd_resv_release)) + +const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode, + u8 *cdw10); +#define __parse_nvme_cmd(opcode, cdw10) \ + nvme_trace_parse_nvm_cmd(p, opcode, cdw10) + +TRACE_EVENT(nvme_setup_admin_cmd, + TP_PROTO(struct nvme_command *cmd), + TP_ARGS(cmd), + TP_STRUCT__entry( + __field(u8, opcode) + __field(u8, flags) + __field(u16, cid) + __field(u64, metadata) + __array(u8, cdw10, 24) + ), + TP_fast_assign( + __entry->opcode = cmd->common.opcode; + __entry->flags = cmd->common.flags; + __entry->cid = cmd->common.command_id; + __entry->metadata = le64_to_cpu(cmd->common.metadata); + memcpy(__entry->cdw10, cmd->common.cdw10, + sizeof(__entry->cdw10)); + ), + TP_printk(" cmdid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)", + __entry->cid, __entry->flags, __entry->metadata, + show_admin_opcode_name(__entry->opcode), + __parse_nvme_admin_cmd(__entry->opcode, __entry->cdw10)) +); + + +TRACE_EVENT(nvme_setup_nvm_cmd, + TP_PROTO(int qid, struct nvme_command *cmd), + TP_ARGS(qid, cmd), + TP_STRUCT__entry( + __field(int, qid) + 
__field(u8, opcode) + __field(u8, flags) + __field(u16, cid) + __field(u32, nsid) + __field(u64, metadata) + __array(u8, cdw10, 24) + ), + TP_fast_assign( + __entry->qid = qid; + __entry->opcode = cmd->common.opcode; + __entry->flags = cmd->common.flags; + __entry->cid = cmd->common.command_id; + __entry->nsid = le32_to_cpu(cmd->common.nsid); + __entry->metadata = le64_to_cpu(cmd->common.metadata); + memcpy(__entry->cdw10, cmd->common.cdw10, + sizeof(__entry->cdw10)); + ), + TP_printk("qid=%d, nsid=%u, cmdid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)", + __entry->qid, __entry->nsid, __entry->cid, + __entry->flags, __entry->metadata, + show_opcode_name(__entry->opcode), + __parse_nvme_cmd(__entry->opcode, __entry->cdw10)) +); + +TRACE_EVENT(nvme_complete_rq, + TP_PROTO(struct request *req), + TP_ARGS(req), + TP_STRUCT__entry( + __field(int, qid) + __field(int, cid) + __field(u64, result) + __field(u8, retries) + __field(u8, flags) + __field(u16, status) + ), + TP_fast_assign( + __entry->qid = req->q->id; + __entry->cid = req->tag; + __entry->result = le64_to_cpu(nvme_req(req)->result.u64); + __entry->retries = nvme_req(req)->retries; + __entry->flags = nvme_req(req)->flags; + __entry->status = nvme_req(req)->status; + ), + TP_printk("cmdid=%u, qid=%d, res=%llu, retries=%u, flags=0x%x, status=%u", + __entry->cid, __entry->qid, __entry->result, + __entry->retries, __entry->flags, __entry->status) + +); + +#endif /* _TRACE_NVME_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + +/* This part must be outside protection */ +#include <trace/define_trace.h> |