summaryrefslogtreecommitdiffstats
path: root/drivers/nvme/host/core.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/nvme/host/core.c')
-rw-r--r--drivers/nvme/host/core.c257
1 files changed, 250 insertions, 7 deletions
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 44a1a257e0b5..25ec4e585220 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -26,6 +26,7 @@
#include <linux/ptrace.h>
#include <linux/nvme_ioctl.h>
#include <linux/t10-pi.h>
+#include <linux/pm_qos.h>
#include <scsi/sg.h>
#include <asm/unaligned.h>
@@ -56,6 +57,11 @@ EXPORT_SYMBOL_GPL(nvme_max_retries);
static int nvme_char_major;
module_param(nvme_char_major, int, 0);
+static unsigned long default_ps_max_latency_us = 25000;
+module_param(default_ps_max_latency_us, ulong, 0644);
+MODULE_PARM_DESC(default_ps_max_latency_us,
+ "max power saving latency for new devices; use PM QOS to change per device");
+
static LIST_HEAD(nvme_ctrl_list);
static DEFINE_SPINLOCK(dev_list_lock);
@@ -560,7 +566,7 @@ int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
c.identify.opcode = nvme_admin_identify;
- c.identify.cns = cpu_to_le32(NVME_ID_CNS_CTRL);
+ c.identify.cns = NVME_ID_CNS_CTRL;
*id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
if (!*id)
@@ -578,7 +584,7 @@ static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *n
struct nvme_command c = { };
c.identify.opcode = nvme_admin_identify;
- c.identify.cns = cpu_to_le32(NVME_ID_CNS_NS_ACTIVE_LIST);
+ c.identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST;
c.identify.nsid = cpu_to_le32(nsid);
return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 0x1000);
}
@@ -590,8 +596,9 @@ int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid,
int error;
/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
- c.identify.opcode = nvme_admin_identify,
- c.identify.nsid = cpu_to_le32(nsid),
+ c.identify.opcode = nvme_admin_identify;
+ c.identify.nsid = cpu_to_le32(nsid);
+ c.identify.cns = NVME_ID_CNS_NS;
*id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL);
if (!*id)
@@ -1251,6 +1258,176 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
blk_queue_write_cache(q, vwc, vwc);
}
+static void nvme_configure_apst(struct nvme_ctrl *ctrl)
+{
+ /*
+ * APST (Autonomous Power State Transition) lets us program a
+ * table of power state transitions that the controller will
+ * perform automatically. We configure it with a simple
+ * heuristic: we are willing to spend at most 2% of the time
+ * transitioning between power states. Therefore, when running
+ * in any given state, we will enter the next lower-power
+ * non-operational state after waiting 100 * (enlat + exlat)
+ * microseconds, as long as that state's total latency is under
+ * the requested maximum latency.
+ *
+ * We will not autonomously enter any non-operational state for
+ * which the total latency exceeds ps_max_latency_us. Users
+ * can set ps_max_latency_us to zero to turn off APST.
+ */
+
+ unsigned apste;
+ struct nvme_feat_auto_pst *table;
+ int ret;
+
+ /*
+ * If APST isn't supported or if we haven't been initialized yet,
+ * then don't do anything.
+ */
+ if (!ctrl->apsta)
+ return;
+
+ if (ctrl->npss > 31) {
+ dev_warn(ctrl->device, "NPSS is invalid; not using APST\n");
+ return;
+ }
+
+ table = kzalloc(sizeof(*table), GFP_KERNEL);
+ if (!table)
+ return;
+
+ if (ctrl->ps_max_latency_us == 0) {
+ /* Turn off APST. */
+ apste = 0;
+ } else {
+ __le64 target = cpu_to_le64(0);
+ int state;
+
+ /*
+ * Walk through all states from lowest- to highest-power.
+ * According to the spec, lower-numbered states use more
+ * power. NPSS, despite the name, is the index of the
+ * lowest-power state, not the number of states.
+ */
+ for (state = (int)ctrl->npss; state >= 0; state--) {
+ u64 total_latency_us, transition_ms;
+
+ if (target)
+ table->entries[state] = target;
+
+ /*
+ * Is this state a useful non-operational state for
+ * higher-power states to autonomously transition to?
+ */
+ if (!(ctrl->psd[state].flags &
+ NVME_PS_FLAGS_NON_OP_STATE))
+ continue;
+
+ total_latency_us =
+ (u64)le32_to_cpu(ctrl->psd[state].entry_lat) +
+ + le32_to_cpu(ctrl->psd[state].exit_lat);
+ if (total_latency_us > ctrl->ps_max_latency_us)
+ continue;
+
+ /*
+ * This state is good. Use it as the APST idle
+ * target for higher power states.
+ */
+ transition_ms = total_latency_us + 19;
+ do_div(transition_ms, 20);
+ if (transition_ms > (1 << 24) - 1)
+ transition_ms = (1 << 24) - 1;
+
+ target = cpu_to_le64((state << 3) |
+ (transition_ms << 8));
+ }
+
+ apste = 1;
+ }
+
+ ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
+ table, sizeof(*table), NULL);
+ if (ret)
+ dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret);
+
+ kfree(table);
+}
+
+static void nvme_set_latency_tolerance(struct device *dev, s32 val)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+ u64 latency;
+
+ switch (val) {
+ case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT:
+ case PM_QOS_LATENCY_ANY:
+ latency = U64_MAX;
+ break;
+
+ default:
+ latency = val;
+ }
+
+ if (ctrl->ps_max_latency_us != latency) {
+ ctrl->ps_max_latency_us = latency;
+ nvme_configure_apst(ctrl);
+ }
+}
+
+struct nvme_core_quirk_entry {
+ /*
+ * NVMe model and firmware strings are padded with spaces. For
+ * simplicity, strings in the quirk table are padded with NULLs
+ * instead.
+ */
+ u16 vid;
+ const char *mn;
+ const char *fr;
+ unsigned long quirks;
+};
+
+static const struct nvme_core_quirk_entry core_quirks[] = {
+ /*
+ * Seen on a Samsung "SM951 NVMe SAMSUNG 256GB": using APST causes
+ * the controller to go out to lunch. It dies when the watchdog
+ * timer reads CSTS and gets 0xffffffff.
+ */
+ {
+ .vid = 0x144d,
+ .fr = "BXW75D0Q",
+ .quirks = NVME_QUIRK_NO_APST,
+ },
+};
+
+/* match is null-terminated but idstr is space-padded. */
+static bool string_matches(const char *idstr, const char *match, size_t len)
+{
+ size_t matchlen;
+
+ if (!match)
+ return true;
+
+ matchlen = strlen(match);
+ WARN_ON_ONCE(matchlen > len);
+
+ if (memcmp(idstr, match, matchlen))
+ return false;
+
+ for (; matchlen < len; matchlen++)
+ if (idstr[matchlen] != ' ')
+ return false;
+
+ return true;
+}
+
+static bool quirk_matches(const struct nvme_id_ctrl *id,
+ const struct nvme_core_quirk_entry *q)
+{
+ return q->vid == le16_to_cpu(id->vid) &&
+ string_matches(id->mn, q->mn, sizeof(id->mn)) &&
+ string_matches(id->fr, q->fr, sizeof(id->fr));
+}
+
/*
* Initialize the cached copies of the Identify data and various controller
* register in our nvme_ctrl structure. This should be called as soon as
@@ -1262,6 +1439,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
u64 cap;
int ret, page_shift;
u32 max_hw_sectors;
+ u8 prev_apsta;
ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
if (ret) {
@@ -1285,6 +1463,24 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
return -EIO;
}
+ if (!ctrl->identified) {
+ /*
+ * Check for quirks. Quirk can depend on firmware version,
+ * so, in principle, the set of quirks present can change
+ * across a reset. As a possible future enhancement, we
+ * could re-scan for quirks every time we reinitialize
+ * the device, but we'd have to make sure that the driver
+ * behaves intelligently if the quirks change.
+ */
+
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(core_quirks); i++) {
+ if (quirk_matches(id, &core_quirks[i]))
+ ctrl->quirks |= core_quirks[i].quirks;
+ }
+ }
+
ctrl->oacs = le16_to_cpu(id->oacs);
ctrl->vid = le16_to_cpu(id->vid);
ctrl->oncs = le16_to_cpup(&id->oncs);
@@ -1305,6 +1501,11 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
ctrl->sgls = le32_to_cpu(id->sgls);
ctrl->kas = le16_to_cpu(id->kas);
+ ctrl->npss = id->npss;
+ prev_apsta = ctrl->apsta;
+ ctrl->apsta = (ctrl->quirks & NVME_QUIRK_NO_APST) ? 0 : id->apsta;
+ memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));
+
if (ctrl->ops->is_fabrics) {
ctrl->icdoff = le16_to_cpu(id->icdoff);
ctrl->ioccsz = le32_to_cpu(id->ioccsz);
@@ -1328,6 +1529,16 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
}
kfree(id);
+
+ if (ctrl->apsta && !prev_apsta)
+ dev_pm_qos_expose_latency_tolerance(ctrl->device);
+ else if (!ctrl->apsta && prev_apsta)
+ dev_pm_qos_hide_latency_tolerance(ctrl->device);
+
+ nvme_configure_apst(ctrl);
+
+ ctrl->identified = true;
+
return ret;
}
EXPORT_SYMBOL_GPL(nvme_init_identify);
@@ -1577,6 +1788,29 @@ static ssize_t nvme_sysfs_show_transport(struct device *dev,
}
static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL);
+static ssize_t nvme_sysfs_show_state(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+ static const char *const state_name[] = {
+ [NVME_CTRL_NEW] = "new",
+ [NVME_CTRL_LIVE] = "live",
+ [NVME_CTRL_RESETTING] = "resetting",
+ [NVME_CTRL_RECONNECTING]= "reconnecting",
+ [NVME_CTRL_DELETING] = "deleting",
+ [NVME_CTRL_DEAD] = "dead",
+ };
+
+ if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) &&
+ state_name[ctrl->state])
+ return sprintf(buf, "%s\n", state_name[ctrl->state]);
+
+ return sprintf(buf, "unknown state\n");
+}
+
+static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL);
+
static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev,
struct device_attribute *attr,
char *buf)
@@ -1609,6 +1843,7 @@ static struct attribute *nvme_dev_attrs[] = {
&dev_attr_transport.attr,
&dev_attr_subsysnqn.attr,
&dev_attr_address.attr,
+ &dev_attr_state.attr,
NULL
};
@@ -2065,6 +2300,14 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
list_add_tail(&ctrl->node, &nvme_ctrl_list);
spin_unlock(&dev_list_lock);
+ /*
+ * Initialize latency tolerance controls. The sysfs files won't
+ * be visible to userspace unless the device actually supports APST.
+ */
+ ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance;
+ dev_pm_qos_update_user_latency_tolerance(ctrl->device,
+ min(default_ps_max_latency_us, (unsigned long)S32_MAX));
+
return 0;
out_release_instance:
nvme_release_instance(ctrl);
@@ -2090,9 +2333,9 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
* Revalidating a dead namespace sets capacity to 0. This will
* end buffered writers dirtying pages that can't be synced.
*/
- if (ns->disk && !test_and_set_bit(NVME_NS_DEAD, &ns->flags))
- revalidate_disk(ns->disk);
-
+ if (!ns->disk || test_and_set_bit(NVME_NS_DEAD, &ns->flags))
+ continue;
+ revalidate_disk(ns->disk);
blk_set_queue_dying(ns->queue);
blk_mq_abort_requeue_list(ns->queue);
blk_mq_start_stopped_hw_queues(ns->queue, true);