From d34cb808402898e53b9a9bcbbedd01667a78723b Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 25 Sep 2017 11:01:31 -0700
Subject: libnvdimm, dimm: clear 'locked' status on successful DIMM enable

If we successfully enable a DIMM then it must not be locked and we can
clear the label-read failure condition. Otherwise, we need to reload the
entire bus provider driver to achieve the same effect, and that can
disrupt unrelated DIMMs and namespaces.

Fixes: 9d62ed965118 ("libnvdimm: handle locked label storage areas")
Cc: <stable@vger.kernel.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/dimm.c      | 1 +
 drivers/nvdimm/dimm_devs.c | 7 +++++++
 drivers/nvdimm/nd.h        | 1 +
 3 files changed, 9 insertions(+)

diff --git a/drivers/nvdimm/dimm.c b/drivers/nvdimm/dimm.c
index e0f0e3ce1a32..98466d762c8f 100644
--- a/drivers/nvdimm/dimm.c
+++ b/drivers/nvdimm/dimm.c
@@ -68,6 +68,7 @@ static int nvdimm_probe(struct device *dev)
 	rc = nd_label_reserve_dpa(ndd);
 	if (ndd->ns_current >= 0)
 		nvdimm_set_aliasing(dev);
+	nvdimm_clear_locked(dev);
 	nvdimm_bus_unlock(dev);
 
 	if (rc)
diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c
index f0d1b7e5de01..5f1385b96b13 100644
--- a/drivers/nvdimm/dimm_devs.c
+++ b/drivers/nvdimm/dimm_devs.c
@@ -200,6 +200,13 @@ void nvdimm_set_locked(struct device *dev)
 	set_bit(NDD_LOCKED, &nvdimm->flags);
 }
 
+void nvdimm_clear_locked(struct device *dev)
+{
+	struct nvdimm *nvdimm = to_nvdimm(dev);
+
+	clear_bit(NDD_LOCKED, &nvdimm->flags);
+}
+
 static void nvdimm_release(struct device *dev)
 {
 	struct nvdimm *nvdimm = to_nvdimm(dev);
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 9c758a91372b..156be00e1f76 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -254,6 +254,7 @@ long nvdimm_clear_poison(struct device *dev, phys_addr_t phys,
 		unsigned int len);
 void nvdimm_set_aliasing(struct device *dev);
 void nvdimm_set_locked(struct device *dev);
+void nvdimm_clear_locked(struct device *dev);
 struct nd_btt *to_nd_btt(struct device *dev);
 
 struct nd_gen_sb {
-- 
cgit v1.2.3


From d083e6dae083ba3576ee454d5cab4491d9c21ec5 Mon Sep 17 00:00:00 2001
From: Arvind Yadav <arvind.yadav.cs@gmail.com>
Date: Mon, 25 Sep 2017 16:11:13 +0530
Subject: dax: pr_err() strings should end with newlines

pr_err() messages should terminated with a new-line to avoid
other messages being concatenated onto the end.

Signed-off-by: Arvind Yadav <arvind.yadav.cs@gmail.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dax/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 557b93703532..30d8a5aedd23 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -118,7 +118,7 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize)
 	put_dax(dax_dev);
 
 	if (len < 1) {
-		pr_err("VFS (%s): error: dax access failed (%ld)",
+		pr_err("VFS (%s): error: dax access failed (%ld)\n",
 				sb->s_id, len);
 		return len < 0 ? len : -EIO;
 	}
-- 
cgit v1.2.3


From b8ff981f88df03c72a4de2f6eaa9ce447a10ac03 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 26 Sep 2017 11:17:52 -0700
Subject: libnvdimm, region : make 'resource' attribute only readable by root

For the same reason that /proc/iomem returns 0's for non-root readers
and acpi tables are root-only, make the 'resource' attribute for region
devices only readable by root. Otherwise we disclose physical address
information.

Fixes: 802f4be6feee ("libnvdimm: Add 'resource' sysfs attribute to regions")
Cc: <stable@vger.kernel.org>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Johannes Thumshirn <jthumshirn@suse.de>
Reported-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/region_devs.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 829d760f651c..abaf38c61220 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -562,8 +562,12 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n)
 	if (!is_nd_pmem(dev) && a == &dev_attr_badblocks.attr)
 		return 0;
 
-	if (!is_nd_pmem(dev) && a == &dev_attr_resource.attr)
-		return 0;
+	if (a == &dev_attr_resource.attr) {
+		if (is_nd_pmem(dev))
+			return 0400;
+		else
+			return 0;
+	}
 
 	if (a == &dev_attr_deep_flush.attr) {
 		int has_flush = nvdimm_has_flush(nd_region);
-- 
cgit v1.2.3


From c1fb3542074fd0c4d901d778bd52455111e4eb6f Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 26 Sep 2017 11:21:24 -0700
Subject: libnvdimm, namespace: make 'resource' attribute only readable by root

For the same reason that /proc/iomem returns 0's for non-root readers
and acpi tables are root-only, make the 'resource' attribute for
namespace devices only readable by root. Otherwise we disclose physical
address information.

Fixes: bf9bccc14c05 ("libnvdimm: pmem label sets and namespace instantiation")
Cc: <stable@vger.kernel.org>
Reported-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/namespace_devs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index 3e4d1e7998da..0af988739a06 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -1620,7 +1620,7 @@ static umode_t namespace_visible(struct kobject *kobj,
 	if (a == &dev_attr_resource.attr) {
 		if (is_namespace_blk(dev))
 			return 0;
-		return a->mode;
+		return 0400;
 	}
 
 	if (is_namespace_pmem(dev) || is_namespace_blk(dev)) {
-- 
cgit v1.2.3


From 26417ae4fc6108f8db436f24108b08f68bdc520e Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 26 Sep 2017 13:07:06 -0700
Subject: libnvdimm, pfn: make 'resource' attribute only readable by root

For the same reason that /proc/iomem returns 0's for non-root readers
and acpi tables are root-only, make the 'resource' attribute for pfn
devices only readable by root. Otherwise we disclose physical address
information.

Fixes: f6ed58c70d14 ("libnvdimm, pfn: 'resource'-address and 'size'...")
Cc: <stable@vger.kernel.org>
Reported-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/pfn_devs.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 9576c444f0ab..65cc171c721d 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -282,8 +282,16 @@ static struct attribute *nd_pfn_attributes[] = {
 	NULL,
 };
 
+static umode_t pfn_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+	if (a == &dev_attr_resource.attr)
+		return 0400;
+	return a->mode;
+}
+
 struct attribute_group nd_pfn_attribute_group = {
 	.attrs = nd_pfn_attributes,
+	.is_visible = pfn_visible,
 };
 
 static const struct attribute_group *nd_pfn_attribute_groups[] = {
-- 
cgit v1.2.3


From b18d4b8a25af6fe83d7692191d6ff962ea611c4f Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 26 Sep 2017 11:41:28 -0700
Subject: libnvdimm, namespace: fix label initialization to use valid seq
 numbers

The set of valid sequence numbers is {1,2,3}. The specification
indicates that an implementation should consider 0 a sign of a critical
error:

    UEFI 2.7: 13.19 NVDIMM Label Protocol

    Software never writes the sequence number 00, so a correctly
    check-summed Index Block with this sequence number probably indicates a
    critical error. When software discovers this case it treats it as an
    invalid Index Block indication.

While the expectation is that the invalid block is just thrown away, the
Robustness Principle says we should fix this to make both sequence
numbers valid.

Fixes: f524bf271a5c ("libnvdimm: write pmem label set")
Cc: <stable@vger.kernel.org>
Reported-by: Juston Li <juston.li@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/label.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c
index 9c5f108910e3..de66c02f6140 100644
--- a/drivers/nvdimm/label.c
+++ b/drivers/nvdimm/label.c
@@ -1050,7 +1050,7 @@ static int init_labels(struct nd_mapping *nd_mapping, int num_labels)
 	nsindex = to_namespace_index(ndd, 0);
 	memset(nsindex, 0, ndd->nsarea.config_size);
 	for (i = 0; i < 2; i++) {
-		int rc = nd_label_write_index(ndd, i, i*2, ND_NSINDEX_INIT);
+		int rc = nd_label_write_index(ndd, i, 3 - i, ND_NSINDEX_INIT);
 
 		if (rc)
 			return rc;
-- 
cgit v1.2.3


From a586cb4937fa328a857c8d7fbefb986d347bd4a7 Mon Sep 17 00:00:00 2001
From: Yasunori Goto <y-goto@jp.fujitsu.com>
Date: Fri, 22 Sep 2017 16:46:26 +0900
Subject: nfit_test Make private definitions to command emulation

Move private definitions to command emulation.
These definitions were originally defined at include/uapi/linux/ndctl.h,
but they are used at only nfit_test emulation now.

Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Reviewed-by: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 tools/testing/nvdimm/test/nfit_test.h | 47 +++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/tools/testing/nvdimm/test/nfit_test.h b/tools/testing/nvdimm/test/nfit_test.h
index d3d63dd5ed38..52c83be9dcfa 100644
--- a/tools/testing/nvdimm/test/nfit_test.h
+++ b/tools/testing/nvdimm/test/nfit_test.h
@@ -32,6 +32,53 @@ struct nfit_test_resource {
 	void *buf;
 };
 
+#define ND_TRANSLATE_SPA_STATUS_INVALID_SPA  2
+
+/* nfit commands */
+enum nfit_cmd_num {
+	NFIT_CMD_TRANSLATE_SPA = 5,
+	NFIT_CMD_ARS_INJECT_SET = 7,
+	NFIT_CMD_ARS_INJECT_CLEAR = 8,
+	NFIT_CMD_ARS_INJECT_GET = 9,
+};
+
+struct nd_cmd_translate_spa {
+	__u64 spa;
+	__u32 status;
+	__u8  flags;
+	__u8  _reserved[3];
+	__u64 translate_length;
+	__u32 num_nvdimms;
+	struct nd_nvdimm_device {
+		__u32 nfit_device_handle;
+		__u32 _reserved;
+		__u64 dpa;
+	} __packed devices[0];
+
+} __packed;
+
+struct nd_cmd_ars_err_inj {
+	__u64 err_inj_spa_range_base;
+	__u64 err_inj_spa_range_length;
+	__u8  err_inj_options;
+	__u32 status;
+} __packed;
+
+struct nd_cmd_ars_err_inj_clr {
+	__u64 err_inj_clr_spa_range_base;
+	__u64 err_inj_clr_spa_range_length;
+	__u32 status;
+} __packed;
+
+struct nd_cmd_ars_err_inj_stat {
+	__u32 status;
+	__u32 inj_err_rec_count;
+	struct nd_error_stat_query_record {
+		__u64 err_inj_stat_spa_range_base;
+		__u64 err_inj_stat_spa_range_length;
+	} __packed record[0];
+} __packed;
+
 union acpi_object;
 typedef void *acpi_handle;
 
-- 
cgit v1.2.3


From b37b3fd33d034470749db55d1ccd7331bd35bdca Mon Sep 17 00:00:00 2001
From: Yasunori Goto <y-goto@jp.fujitsu.com>
Date: Fri, 22 Sep 2017 16:47:40 +0900
Subject: acpi nfit: Enable to show what feature is supported via ND_CMD_CALL
 for nfit_test

Though nfit_test need to show what feature is supported via ND_CMD_CALL on
device/nfit/dsm_mask, currently there is no way to tell it.
This patch makes to enable it.

Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Reviewed-by: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/core.c | 1 +
 drivers/acpi/nfit/nfit.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 9c2c49b6a240..3369bb8fd9ba 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -1645,6 +1645,7 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc)
 	int i;
 
 	nd_desc->cmd_mask = acpi_desc->bus_cmd_force_en;
+	nd_desc->bus_dsm_mask = acpi_desc->bus_nfit_cmd_force_en;
 	adev = to_acpi_dev(acpi_desc);
 	if (!adev)
 		return;
diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h
index 54292db61262..f2c6380bdbb2 100644
--- a/drivers/acpi/nfit/nfit.h
+++ b/drivers/acpi/nfit/nfit.h
@@ -167,6 +167,7 @@ struct acpi_nfit_desc {
 	unsigned int init_complete:1;
 	unsigned long dimm_cmd_force_en;
 	unsigned long bus_cmd_force_en;
+	unsigned long bus_nfit_cmd_force_en;
 	int (*blk_do_io)(struct nd_blk_region *ndbr, resource_size_t dpa,
 			void *iobuf, u64 len, int rw);
 };
-- 
cgit v1.2.3


From 10246dc84dfcabb05f2d29d442c9de967fd61270 Mon Sep 17 00:00:00 2001
From: Yasunori Goto <y-goto@jp.fujitsu.com>
Date: Fri, 22 Sep 2017 16:48:38 +0900
Subject: acpi nfit: nfit_test supports translate SPA

To test ndctl list which use interface of Translate SPA,
nfit_test needs to emulates it.
This test module searches region which includes SPA and
returns 1 dimm handle which is last one.

Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Reviewed-by: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 tools/testing/nvdimm/test/nfit.c | 104 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 103 insertions(+), 1 deletion(-)

diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index bef419d4266d..3a3bad348d56 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -342,6 +342,84 @@ static int nfit_test_cmd_clear_error(struct nd_cmd_clear_error *clear_err,
 	return 0;
 }
 
+struct region_search_spa {
+	u64 addr;
+	struct nd_region *region;
+};
+
+static int is_region_device(struct device *dev)
+{
+	return !strncmp(dev->kobj.name, "region", 6);
+}
+
+static int nfit_test_search_region_spa(struct device *dev, void *data)
+{
+	struct region_search_spa *ctx = data;
+	struct nd_region *nd_region;
+	resource_size_t ndr_end;
+
+	if (!is_region_device(dev))
+		return 0;
+
+	nd_region = to_nd_region(dev);
+	ndr_end = nd_region->ndr_start + nd_region->ndr_size;
+
+	if (ctx->addr >= nd_region->ndr_start && ctx->addr < ndr_end) {
+		ctx->region = nd_region;
+		return 1;
+	}
+
+	return 0;
+}
+
+static int nfit_test_search_spa(struct nvdimm_bus *bus,
+		struct nd_cmd_translate_spa *spa)
+{
+	int ret;
+	struct nd_region *nd_region = NULL;
+	struct nvdimm *nvdimm = NULL;
+	struct nd_mapping *nd_mapping = NULL;
+	struct region_search_spa ctx = {
+		.addr = spa->spa,
+		.region = NULL,
+	};
+	u64 dpa;
+
+	ret = device_for_each_child(&bus->dev, &ctx,
+				nfit_test_search_region_spa);
+
+	if (!ret)
+		return -ENODEV;
+
+	nd_region = ctx.region;
+
+	dpa = ctx.addr - nd_region->ndr_start;
+
+	/*
+	 * last dimm is selected for test
+	 */
+	nd_mapping = &nd_region->mapping[nd_region->ndr_mappings - 1];
+	nvdimm = nd_mapping->nvdimm;
+
+	spa->devices[0].nfit_device_handle = handle[nvdimm->id];
+	spa->num_nvdimms = 1;
+	spa->devices[0].dpa = dpa;
+
+	return 0;
+}
+
+static int nfit_test_cmd_translate_spa(struct nvdimm_bus *bus,
+		struct nd_cmd_translate_spa *spa, unsigned int buf_len)
+{
+	if (buf_len < spa->translate_length)
+		return -EINVAL;
+
+	if (nfit_test_search_spa(bus, spa) < 0 || !spa->num_nvdimms)
+		spa->status = 2;
+
+	return 0;
+}
+
 static int nfit_test_cmd_smart(struct nd_cmd_smart *smart, unsigned int buf_len)
 {
 	static const struct nd_smart_payload smart_data = {
@@ -449,6 +527,26 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
 		}
 	} else {
 		struct ars_state *ars_state = &t->ars_state;
+		struct nd_cmd_pkg *call_pkg = buf;
+
+		if (!nd_desc)
+			return -ENOTTY;
+
+		if (cmd == ND_CMD_CALL) {
+			func = call_pkg->nd_command;
+
+			buf_len = call_pkg->nd_size_in + call_pkg->nd_size_out;
+			buf = (void *) call_pkg->nd_payload;
+
+			switch (func) {
+			case NFIT_CMD_TRANSLATE_SPA:
+				rc = nfit_test_cmd_translate_spa(
+					acpi_desc->nvdimm_bus, buf, buf_len);
+				return rc;
+			default:
+				return -ENOTTY;
+			}
+		}
 
 		if (!nd_desc || !test_bit(cmd, &nd_desc->cmd_mask))
 			return -ENOTTY;
@@ -1430,7 +1528,9 @@ static void nfit_test0_setup(struct nfit_test *t)
 	set_bit(ND_CMD_ARS_START, &acpi_desc->bus_cmd_force_en);
 	set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_cmd_force_en);
 	set_bit(ND_CMD_CLEAR_ERROR, &acpi_desc->bus_cmd_force_en);
+	set_bit(ND_CMD_CALL, &acpi_desc->bus_cmd_force_en);
 	set_bit(ND_CMD_SMART_THRESHOLD, &acpi_desc->dimm_cmd_force_en);
+	set_bit(NFIT_CMD_TRANSLATE_SPA, &acpi_desc->bus_nfit_cmd_force_en);
 }
 
 static void nfit_test1_setup(struct nfit_test *t)
@@ -1613,10 +1713,12 @@ static int nfit_ctl_test(struct device *dev)
 			.cmd_mask = 1UL << ND_CMD_ARS_CAP
 				| 1UL << ND_CMD_ARS_START
 				| 1UL << ND_CMD_ARS_STATUS
-				| 1UL << ND_CMD_CLEAR_ERROR,
+				| 1UL << ND_CMD_CLEAR_ERROR
+				| 1UL << ND_CMD_CALL,
 			.module = THIS_MODULE,
 			.provider_name = "ACPI.NFIT",
 			.ndctl = acpi_nfit_ctl,
+			.bus_dsm_mask = 1UL << NFIT_CMD_TRANSLATE_SPA,
 		},
 		.dev = &adev->dev,
 	};
-- 
cgit v1.2.3


From 4b27db7e26cdb9deb4a211e963e2993fde0110cb Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sun, 24 Sep 2017 09:57:34 -0700
Subject: acpi, nfit: add support for the _LSI, _LSR, and _LSW label methods

ACPI 6.2 adds support for named methods to access the label storage area
of an NVDIMM. We prefer these new methods if available and otherwise
fallback to the NVDIMM_FAMILY_INTEL _DSMs. The kernel ioctls,
ND_IOCTL_{GET,SET}_CONFIG_{SIZE,DATA}, remain generic and the driver
translates the 'package' payloads into the NVDIMM_FAMILY_INTEL 'buffer'
format to maintain compatibility with existing userspace and keep the
output buffer parsing code in the driver common.

The output payloads are mostly compatible save for the 'label area
locked' status that moves from the 'config_size' (_LSI) command to the
'config_read' (_LSR) command status.

Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/core.c | 213 ++++++++++++++++++++++++++++++++++++++++++++++-
 drivers/acpi/nfit/nfit.h |   3 +
 drivers/nvdimm/dimm.c    |   2 +
 3 files changed, 214 insertions(+), 4 deletions(-)

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 3369bb8fd9ba..ebe0857ac346 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -183,13 +183,33 @@ static int xlat_bus_status(void *buf, unsigned int cmd, u32 status)
 	return 0;
 }
 
-static int xlat_nvdimm_status(void *buf, unsigned int cmd, u32 status)
+#define ACPI_LABELS_LOCKED 3
+
+static int xlat_nvdimm_status(struct nvdimm *nvdimm, void *buf, unsigned int cmd,
+		u32 status)
 {
+	struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
+
 	switch (cmd) {
 	case ND_CMD_GET_CONFIG_SIZE:
+		/*
+		 * In the _LSI, _LSR, _LSW case the locked status is
+		 * communicated via the read/write commands
+		 */
+		if (nfit_mem->has_lsi)
+			break;
+
 		if (status >> 16 & ND_CONFIG_LOCKED)
 			return -EACCES;
 		break;
+	case ND_CMD_GET_CONFIG_DATA:
+		if (nfit_mem->has_lsr && status == ACPI_LABELS_LOCKED)
+			return -EACCES;
+		break;
+	case ND_CMD_SET_CONFIG_DATA:
+		if (nfit_mem->has_lsw && status == ACPI_LABELS_LOCKED)
+			return -EACCES;
+		break;
 	default:
 		break;
 	}
@@ -205,13 +225,156 @@ static int xlat_status(struct nvdimm *nvdimm, void *buf, unsigned int cmd,
 {
 	if (!nvdimm)
 		return xlat_bus_status(buf, cmd, status);
-	return xlat_nvdimm_status(buf, cmd, status);
+	return xlat_nvdimm_status(nvdimm, buf, cmd, status);
+}
+
+/* convert _LS{I,R} packages to the buffer object acpi_nfit_ctl expects */
+static union acpi_object *pkg_to_buf(union acpi_object *pkg)
+{
+	int i;
+	void *dst;
+	size_t size = 0;
+	union acpi_object *buf = NULL;
+
+	if (pkg->type != ACPI_TYPE_PACKAGE) {
+		WARN_ONCE(1, "BIOS bug, unexpected element type: %d\n",
+				pkg->type);
+		goto err;
+	}
+
+	for (i = 0; i < pkg->package.count; i++) {
+		union acpi_object *obj = &pkg->package.elements[i];
+
+		if (obj->type == ACPI_TYPE_INTEGER)
+			size += 4;
+		else if (obj->type == ACPI_TYPE_BUFFER)
+			size += obj->buffer.length;
+		else {
+			WARN_ONCE(1, "BIOS bug, unexpected element type: %d\n",
+					obj->type);
+			goto err;
+		}
+	}
+
+	buf = ACPI_ALLOCATE(sizeof(*buf) + size);
+	if (!buf)
+		goto err;
+
+	dst = buf + 1;
+	buf->type = ACPI_TYPE_BUFFER;
+	buf->buffer.length = size;
+	buf->buffer.pointer = dst;
+	for (i = 0; i < pkg->package.count; i++) {
+		union acpi_object *obj = &pkg->package.elements[i];
+
+		if (obj->type == ACPI_TYPE_INTEGER) {
+			memcpy(dst, &obj->integer.value, 4);
+			dst += 4;
+		} else if (obj->type == ACPI_TYPE_BUFFER) {
+			memcpy(dst, obj->buffer.pointer, obj->buffer.length);
+			dst += obj->buffer.length;
+		}
+	}
+err:
+	ACPI_FREE(pkg);
+	return buf;
+}
+
+static union acpi_object *int_to_buf(union acpi_object *integer)
+{
+	union acpi_object *buf = ACPI_ALLOCATE(sizeof(*buf) + 4);
+	void *dst = NULL;
+
+	if (!buf)
+		goto err;
+
+	if (integer->type != ACPI_TYPE_INTEGER) {
+		WARN_ONCE(1, "BIOS bug, unexpected element type: %d\n",
+				integer->type);
+		goto err;
+	}
+
+	dst = buf + 1;
+	buf->type = ACPI_TYPE_BUFFER;
+	buf->buffer.length = 4;
+	buf->buffer.pointer = dst;
+	memcpy(dst, &integer->integer.value, 4);
+err:
+	ACPI_FREE(integer);
+	return buf;
+}
+
+static union acpi_object *acpi_label_write(acpi_handle handle, u32 offset,
+		u32 len, void *data)
+{
+	acpi_status rc;
+	struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL };
+	struct acpi_object_list input = {
+		.count = 3,
+		.pointer = (union acpi_object []) {
+			[0] = {
+				.integer.type = ACPI_TYPE_INTEGER,
+				.integer.value = offset,
+			},
+			[1] = {
+				.integer.type = ACPI_TYPE_INTEGER,
+				.integer.value = len,
+			},
+			[2] = {
+				.buffer.type = ACPI_TYPE_BUFFER,
+				.buffer.pointer = data,
+				.buffer.length = len,
+			},
+		},
+	};
+
+	rc = acpi_evaluate_object(handle, "_LSW", &input, &buf);
+	if (ACPI_FAILURE(rc))
+		return NULL;
+	return int_to_buf(buf.pointer);
+}
+
+static union acpi_object *acpi_label_read(acpi_handle handle, u32 offset,
+		u32 len)
+{
+	acpi_status rc;
+	struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL };
+	struct acpi_object_list input = {
+		.count = 2,
+		.pointer = (union acpi_object []) {
+			[0] = {
+				.integer.type = ACPI_TYPE_INTEGER,
+				.integer.value = offset,
+			},
+			[1] = {
+				.integer.type = ACPI_TYPE_INTEGER,
+				.integer.value = len,
+			},
+		},
+	};
+
+	rc = acpi_evaluate_object(handle, "_LSR", &input, &buf);
+	if (ACPI_FAILURE(rc))
+		return NULL;
+	return pkg_to_buf(buf.pointer);
+}
+
+static union acpi_object *acpi_label_info(acpi_handle handle)
+{
+	acpi_status rc;
+	struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL };
+
+	rc = acpi_evaluate_object(handle, "_LSI", NULL, &buf);
+	if (ACPI_FAILURE(rc))
+		return NULL;
+	return pkg_to_buf(buf.pointer);
 }
 
 int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
 		unsigned int cmd, void *buf, unsigned int buf_len, int *cmd_rc)
 {
 	struct acpi_nfit_desc *acpi_desc = to_acpi_nfit_desc(nd_desc);
+	struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
 	union acpi_object in_obj, in_buf, *out_obj;
 	const struct nd_cmd_desc *desc = NULL;
 	struct device *dev = acpi_desc->dev;
@@ -235,7 +398,6 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
 	}
 
 	if (nvdimm) {
-		struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
 		struct acpi_device *adev = nfit_mem->adev;
 
 		if (!adev)
@@ -294,7 +456,21 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
 			in_buf.buffer.pointer,
 			min_t(u32, 256, in_buf.buffer.length), true);
 
-	out_obj = acpi_evaluate_dsm(handle, guid, 1, func, &in_obj);
+	/* call the BIOS, prefer the named methods over _DSM if available */
+	if (cmd == ND_CMD_GET_CONFIG_SIZE && nfit_mem->has_lsi)
+		out_obj = acpi_label_info(handle);
+	else if (cmd == ND_CMD_GET_CONFIG_DATA && nfit_mem->has_lsr) {
+		struct nd_cmd_get_config_data_hdr *p = buf;
+
+		out_obj = acpi_label_read(handle, p->in_offset, p->in_length);
+	} else if (cmd == ND_CMD_SET_CONFIG_DATA && nfit_mem->has_lsw) {
+		struct nd_cmd_set_config_hdr *p = buf;
+
+		out_obj = acpi_label_write(handle, p->in_offset, p->in_length,
+				p->in_buf);
+	} else
+		out_obj = acpi_evaluate_dsm(handle, guid, 1, func, &in_obj);
+
 	if (!out_obj) {
 		dev_dbg(dev, "%s:%s _DSM failed cmd: %s\n", __func__, dimm_name,
 				cmd_name);
@@ -1431,6 +1607,7 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
 {
 	struct acpi_device *adev, *adev_dimm;
 	struct device *dev = acpi_desc->dev;
+	union acpi_object *obj;
 	unsigned long dsm_mask;
 	const guid_t *guid;
 	int i;
@@ -1496,6 +1673,27 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
 		if (acpi_check_dsm(adev_dimm->handle, guid, 1, 1ULL << i))
 			set_bit(i, &nfit_mem->dsm_mask);
 
+	obj = acpi_label_info(adev_dimm->handle);
+	if (obj) {
+		ACPI_FREE(obj);
+		nfit_mem->has_lsi = 1;
+		dev_dbg(dev, "%s: has _LSI\n", dev_name(&adev_dimm->dev));
+	}
+
+	obj = acpi_label_read(adev_dimm->handle, 0, 0);
+	if (obj) {
+		ACPI_FREE(obj);
+		nfit_mem->has_lsr = 1;
+		dev_dbg(dev, "%s: has _LSR\n", dev_name(&adev_dimm->dev));
+	}
+
+	obj = acpi_label_write(adev_dimm->handle, 0, 0, NULL);
+	if (obj) {
+		ACPI_FREE(obj);
+		nfit_mem->has_lsw = 1;
+		dev_dbg(dev, "%s: has _LSW\n", dev_name(&adev_dimm->dev));
+	}
+
 	return 0;
 }
 
@@ -1574,6 +1772,13 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
 		if (nfit_mem->family == NVDIMM_FAMILY_INTEL)
 			cmd_mask |= nfit_mem->dsm_mask;
 
+		if (nfit_mem->has_lsi)
+			set_bit(ND_CMD_GET_CONFIG_SIZE, &cmd_mask);
+		if (nfit_mem->has_lsr)
+			set_bit(ND_CMD_GET_CONFIG_DATA, &cmd_mask);
+		if (nfit_mem->has_lsw)
+			set_bit(ND_CMD_SET_CONFIG_DATA, &cmd_mask);
+
 		flush = nfit_mem->nfit_flush ? nfit_mem->nfit_flush->flush
 			: NULL;
 		nvdimm = nvdimm_create(acpi_desc->nvdimm_bus, nfit_mem,
diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h
index f2c6380bdbb2..3976d649f27c 100644
--- a/drivers/acpi/nfit/nfit.h
+++ b/drivers/acpi/nfit/nfit.h
@@ -140,6 +140,9 @@ struct nfit_mem {
 	struct resource *flush_wpq;
 	unsigned long dsm_mask;
 	int family;
+	u32 has_lsi:1;
+	u32 has_lsr:1;
+	u32 has_lsw:1;
 };
 
 struct acpi_nfit_desc {
diff --git a/drivers/nvdimm/dimm.c b/drivers/nvdimm/dimm.c
index 98466d762c8f..f8913b8124b6 100644
--- a/drivers/nvdimm/dimm.c
+++ b/drivers/nvdimm/dimm.c
@@ -55,6 +55,8 @@ static int nvdimm_probe(struct device *dev)
 		goto err;
 
 	rc = nvdimm_init_config_data(ndd);
+	if (rc == -EACCES)
+		nvdimm_set_locked(dev);
 	if (rc)
 		goto err;
 
-- 
cgit v1.2.3


From efbf6f50ada168b68f5e3de474bc8dee4a02d046 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 25 Sep 2017 10:24:26 -0700
Subject: libnvdimm: introduce 'flags' attribute for DIMM 'lock' and 'alias'
 status

Given that we now how have two mechanisms for a DIMM to indicate that it
is locked:

    * NVDIMM_FAMILY_INTEL 'get_config_size' _DSM command

    * ACPI 6.2 Label Storage Read / Write commands

...export the generic libnvdimm DIMM status in a new 'flags' attribute.

This attribute can also reflect the 'alias' state which indicates
whether the nvdimm core is enforcing labels for aliased-region-capacity
that the given dimm is an interleave-set member.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/dimm_devs.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c
index 5f1385b96b13..097794d9f786 100644
--- a/drivers/nvdimm/dimm_devs.c
+++ b/drivers/nvdimm/dimm_devs.c
@@ -331,6 +331,17 @@ static ssize_t commands_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(commands);
 
+static ssize_t flags_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvdimm *nvdimm = to_nvdimm(dev);
+
+	return sprintf(buf, "%s%s\n",
+			test_bit(NDD_ALIASING, &nvdimm->flags) ? "alias " : "",
+			test_bit(NDD_LOCKED, &nvdimm->flags) ? "lock " : "");
+}
+static DEVICE_ATTR_RO(flags);
+
 static ssize_t state_show(struct device *dev, struct device_attribute *attr,
 		char *buf)
 {
@@ -372,6 +383,7 @@ static DEVICE_ATTR_RO(available_slots);
 
 static struct attribute *nvdimm_attributes[] = {
 	&dev_attr_state.attr,
+	&dev_attr_flags.attr,
 	&dev_attr_commands.attr,
 	&dev_attr_available_slots.attr,
 	NULL,
-- 
cgit v1.2.3


From 65853a1da3c181f85d368daa40abb91025c368fc Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Thu, 5 Oct 2017 10:55:57 +0100
Subject: libnvdimm, namespace: make a couple of functions static

The functions create_namespace_pmem and create_namespace_blk are local
to the source and do not need to be in global scope, so make them static.

Cleans up sparse warnings:
symbol 'create_namespace_pmem' was not declared. Should it be static?
symbol 'create_namespace_blk' was not declared. Should it be static?

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/namespace_devs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index 0af988739a06..bb3ba8cf24d4 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -1875,7 +1875,7 @@ static int select_pmem_id(struct nd_region *nd_region, u8 *pmem_id)
  * @nspm: target namespace to create
  * @nd_label: target pmem namespace label to evaluate
  */
-struct device *create_namespace_pmem(struct nd_region *nd_region,
+static struct device *create_namespace_pmem(struct nd_region *nd_region,
 		struct nd_namespace_index *nsindex,
 		struct nd_namespace_label *nd_label)
 {
@@ -2186,7 +2186,7 @@ static int add_namespace_resource(struct nd_region *nd_region,
 	return i;
 }
 
-struct device *create_namespace_blk(struct nd_region *nd_region,
+static struct device *create_namespace_blk(struct nd_region *nd_region,
 		struct nd_namespace_label *nd_label, int count)
 {
 
-- 
cgit v1.2.3


From 7fc9be3c136c140b02021c7973b91b20fb1981e5 Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Wed, 18 Oct 2017 09:40:06 -0600
Subject: MAINTAINERS: Add entry for device DAX

And update the entry for filesystem DAX to differentiate them.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 MAINTAINERS | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 6671f375f7fc..456ba0725b8d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4162,7 +4162,7 @@ L:	linux-i2c@vger.kernel.org
 S:	Maintained
 F:	drivers/i2c/busses/i2c-diolan-u2c.c
 
-DIRECT ACCESS (DAX)
+FILESYSTEM DIRECT ACCESS (DAX)
 M:	Matthew Wilcox <mawilcox@microsoft.com>
 M:	Ross Zwisler <ross.zwisler@linux.intel.com>
 L:	linux-fsdevel@vger.kernel.org
@@ -4171,6 +4171,12 @@ F:	fs/dax.c
 F:	include/linux/dax.h
 F:	include/trace/events/fs_dax.h
 
+DEVICE DIRECT ACCESS (DAX)
+M:	Dan Williams <dan.j.williams@intel.com>
+L:	linux-nvdimm@lists.01.org
+S:	Supported
+F:	drivers/dax/
+
 DIRECTORY NOTIFICATION (DNOTIFY)
 M:	Jan Kara <jack@suse.cz>
 R:	Amir Goldstein <amir73il@gmail.com>
-- 
cgit v1.2.3


From 0a3ff78699d1817e711441715d22665475466036 Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Wed, 18 Oct 2017 12:21:55 -0600
Subject: dev/dax: fix uninitialized variable build warning

Fix this build warning:

warning: 'phys' may be used uninitialized in this function
[-Wuninitialized]

As reported here:

https://lkml.org/lkml/2017/10/16/152
http://kisskb.ellerman.id.au/kisskb/buildresult/13181373/log/

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dax/device.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index e9f3b3e4bbf4..6833ada237ab 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -222,7 +222,8 @@ __weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff,
 		unsigned long size)
 {
 	struct resource *res;
-	phys_addr_t phys;
+	/* gcc-4.6.3-nolibc for i386 complains that this is uninitialized */
+	phys_addr_t uninitialized_var(phys);
 	int i;
 
 	for (i = 0; i < dev_dax->num_resources; i++) {
-- 
cgit v1.2.3


From b9b1504d3c6d6831c822cd785e095fc5fb155346 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sun, 29 Oct 2017 12:13:07 -0700
Subject: acpi, nfit: hide unknown commands from nmemX/commands

For vendor specific commands that do not have a common kernel
translation, hide them from nmemX/commands. For example, the following
results from new enabling to probe for support of the new
NVDIMM_FAMILY_INTEL DSMs specified in v1.6 of the command specification
[1]:

    # cat /sys/bus/nd/devices/nmem0/commands
    smart smart_thresh flags get_size get_data set_data effect_size
    effect_log vendor cmd_call unknown unknown unknown unknown unknown
    unknown unknown unknown

[1]: https://pmem.io/documents/NVDIMM_DSM_Interface-V1.6.pdf

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/core.c | 10 ++++++++--
 drivers/acpi/nfit/nfit.h |  6 ++++++
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index ebe0857ac346..444832b372ec 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -1769,8 +1769,14 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
 		 * userspace interface.
 		 */
 		cmd_mask = 1UL << ND_CMD_CALL;
-		if (nfit_mem->family == NVDIMM_FAMILY_INTEL)
-			cmd_mask |= nfit_mem->dsm_mask;
+		if (nfit_mem->family == NVDIMM_FAMILY_INTEL) {
+			/*
+			 * These commands have a 1:1 correspondence
+			 * between DSM payload and libnvdimm ioctl
+			 * payload format.
+			 */
+			cmd_mask |= nfit_mem->dsm_mask & NVDIMM_STANDARD_CMDMASK;
+		}
 
 		if (nfit_mem->has_lsi)
 			set_bit(ND_CMD_GET_CONFIG_SIZE, &cmd_mask);
diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h
index 3976d649f27c..b987196bf132 100644
--- a/drivers/acpi/nfit/nfit.h
+++ b/drivers/acpi/nfit/nfit.h
@@ -38,6 +38,12 @@
 		| ACPI_NFIT_MEM_RESTORE_FAILED | ACPI_NFIT_MEM_FLUSH_FAILED \
 		| ACPI_NFIT_MEM_NOT_ARMED | ACPI_NFIT_MEM_MAP_FAILED)
 
+#define NVDIMM_STANDARD_CMDMASK \
+(1 << ND_CMD_SMART | 1 << ND_CMD_SMART_THRESHOLD | 1 << ND_CMD_DIMM_FLAGS \
+ | 1 << ND_CMD_GET_CONFIG_SIZE | 1 << ND_CMD_GET_CONFIG_DATA \
+ | 1 << ND_CMD_SET_CONFIG_DATA | 1 << ND_CMD_VENDOR_EFFECT_LOG_SIZE \
+ | 1 << ND_CMD_VENDOR_EFFECT_LOG | 1 << ND_CMD_VENDOR)
+
 enum nfit_uuids {
 	/* for simplicity alias the uuid index with the family id */
 	NFIT_DEV_DIMM = NVDIMM_FAMILY_INTEL,
-- 
cgit v1.2.3


From 11e142701609546632ba1fda586252d391026a3f Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 20 Oct 2017 15:39:43 -0700
Subject: acpi, nfit: add support for NVDIMM_FAMILY_INTEL v1.6 DSMs

Per v1.6 of the NVDIMM_FAMILY_INTEL command set [1] some of the new
commands require rev-id 2. In addition to enabling ND_CMD_CALL for these
new function numbers, add a lookup table for revision-ids by family
and function number.

[1]: http://pmem.io/documents/NVDIMM_DSM_Interface-V1.6.pdf

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/core.c | 45 ++++++++++++++++++++++++++++++++++++++++-----
 drivers/acpi/nfit/nfit.h | 26 +++++++++++++++++++++++++-
 2 files changed, 65 insertions(+), 6 deletions(-)

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 444832b372ec..aa75d67ceee9 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -370,6 +370,32 @@ static union acpi_object *acpi_label_info(acpi_handle handle)
 	return pkg_to_buf(buf.pointer);
 }
 
+static u8 nfit_dsm_revid(unsigned family, unsigned func)
+{
+	static const u8 revid_table[NVDIMM_FAMILY_MAX+1][32] = {
+		[NVDIMM_FAMILY_INTEL] = {
+			[NVDIMM_INTEL_GET_MODES] = 2,
+			[NVDIMM_INTEL_GET_FWINFO] = 2,
+			[NVDIMM_INTEL_START_FWUPDATE] = 2,
+			[NVDIMM_INTEL_SEND_FWUPDATE] = 2,
+			[NVDIMM_INTEL_FINISH_FWUPDATE] = 2,
+			[NVDIMM_INTEL_QUERY_FWUPDATE] = 2,
+			[NVDIMM_INTEL_SET_THRESHOLD] = 2,
+			[NVDIMM_INTEL_INJECT_ERROR] = 2,
+		},
+	};
+	u8 id;
+
+	if (family > NVDIMM_FAMILY_MAX)
+		return 0;
+	if (func > 31)
+		return 0;
+	id = revid_table[family][func];
+	if (id == 0)
+		return 1; /* default */
+	return id;
+}
+
 int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
 		unsigned int cmd, void *buf, unsigned int buf_len, int *cmd_rc)
 {
@@ -468,8 +494,15 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
 
 		out_obj = acpi_label_write(handle, p->in_offset, p->in_length,
 				p->in_buf);
-	} else
-		out_obj = acpi_evaluate_dsm(handle, guid, 1, func, &in_obj);
+	} else {
+		u8 revid;
+
+		if (nfit_mem)
+			revid = nfit_dsm_revid(nfit_mem->family, func);
+		else
+			revid = 1;
+		out_obj = acpi_evaluate_dsm(handle, guid, revid, func, &in_obj);
+	}
 
 	if (!out_obj) {
 		dev_dbg(dev, "%s:%s _DSM failed cmd: %s\n", __func__, dimm_name,
@@ -1640,7 +1673,7 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
 	 * different command sets.  Note, that checking for function0 (bit0)
 	 * tells us if any commands are reachable through this GUID.
 	 */
-	for (i = NVDIMM_FAMILY_INTEL; i <= NVDIMM_FAMILY_MSFT; i++)
+	for (i = 0; i <= NVDIMM_FAMILY_MAX; i++)
 		if (acpi_check_dsm(adev_dimm->handle, to_nfit_uuid(i), 1, 1))
 			if (family < 0 || i == default_dsm_family)
 				family = i;
@@ -1650,7 +1683,7 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
 	if (override_dsm_mask && !disable_vendor_specific)
 		dsm_mask = override_dsm_mask;
 	else if (nfit_mem->family == NVDIMM_FAMILY_INTEL) {
-		dsm_mask = 0x3fe;
+		dsm_mask = NVDIMM_INTEL_CMDMASK;
 		if (disable_vendor_specific)
 			dsm_mask &= ~(1 << ND_CMD_VENDOR);
 	} else if (nfit_mem->family == NVDIMM_FAMILY_HPE1) {
@@ -1670,7 +1703,9 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
 
 	guid = to_nfit_uuid(nfit_mem->family);
 	for_each_set_bit(i, &dsm_mask, BITS_PER_LONG)
-		if (acpi_check_dsm(adev_dimm->handle, guid, 1, 1ULL << i))
+		if (acpi_check_dsm(adev_dimm->handle, guid,
+					nfit_dsm_revid(nfit_mem->family, i),
+					1ULL << i))
 			set_bit(i, &nfit_mem->dsm_mask);
 
 	obj = acpi_label_info(adev_dimm->handle);
diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h
index b987196bf132..341be9511d0e 100644
--- a/drivers/acpi/nfit/nfit.h
+++ b/drivers/acpi/nfit/nfit.h
@@ -24,7 +24,7 @@
 /* ACPI 6.1 */
 #define UUID_NFIT_BUS "2f10e7a4-9e91-11e4-89d3-123b93f75cba"
 
-/* http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf */
+/* http://pmem.io/documents/NVDIMM_DSM_Interface-V1.6.pdf */
 #define UUID_NFIT_DIMM "4309ac30-0d11-11e4-9191-0800200c9a66"
 
 /* https://github.com/HewlettPackard/hpe-nvm/blob/master/Documentation/ */
@@ -38,12 +38,36 @@
 		| ACPI_NFIT_MEM_RESTORE_FAILED | ACPI_NFIT_MEM_FLUSH_FAILED \
 		| ACPI_NFIT_MEM_NOT_ARMED | ACPI_NFIT_MEM_MAP_FAILED)
 
+#define NVDIMM_FAMILY_MAX NVDIMM_FAMILY_MSFT
+
 #define NVDIMM_STANDARD_CMDMASK \
 (1 << ND_CMD_SMART | 1 << ND_CMD_SMART_THRESHOLD | 1 << ND_CMD_DIMM_FLAGS \
  | 1 << ND_CMD_GET_CONFIG_SIZE | 1 << ND_CMD_GET_CONFIG_DATA \
  | 1 << ND_CMD_SET_CONFIG_DATA | 1 << ND_CMD_VENDOR_EFFECT_LOG_SIZE \
  | 1 << ND_CMD_VENDOR_EFFECT_LOG | 1 << ND_CMD_VENDOR)
 
+/*
+ * Command numbers that the kernel needs to know about to handle
+ * non-default DSM revision ids
+ */
+enum nvdimm_family_cmds {
+	NVDIMM_INTEL_GET_MODES = 11,
+	NVDIMM_INTEL_GET_FWINFO = 12,
+	NVDIMM_INTEL_START_FWUPDATE = 13,
+	NVDIMM_INTEL_SEND_FWUPDATE = 14,
+	NVDIMM_INTEL_FINISH_FWUPDATE = 15,
+	NVDIMM_INTEL_QUERY_FWUPDATE = 16,
+	NVDIMM_INTEL_SET_THRESHOLD = 17,
+	NVDIMM_INTEL_INJECT_ERROR = 18,
+};
+
+#define NVDIMM_INTEL_CMDMASK \
+(NVDIMM_STANDARD_CMDMASK | 1 << NVDIMM_INTEL_GET_MODES \
+ | 1 << NVDIMM_INTEL_GET_FWINFO | 1 << NVDIMM_INTEL_START_FWUPDATE \
+ | 1 << NVDIMM_INTEL_SEND_FWUPDATE | 1 << NVDIMM_INTEL_FINISH_FWUPDATE \
+ | 1 << NVDIMM_INTEL_QUERY_FWUPDATE | 1 << NVDIMM_INTEL_SET_THRESHOLD \
+ | 1 << NVDIMM_INTEL_INJECT_ERROR)
+
 enum nfit_uuids {
 	/* for simplicity alias the uuid index with the family id */
 	NFIT_DEV_DIMM = NVDIMM_FAMILY_INTEL,
-- 
cgit v1.2.3


From aa9ad44a42b4cf4387f8ecddaf8e51707fdcda5a Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Wed, 23 Aug 2017 12:48:26 -0700
Subject: libnvdimm: move poison list functions to a new 'badrange' file

nfit_test needs to use the poison list manipulation code as well. Make
it more generic and in the process rename poison to badrange, and move
all the related helpers to a new file.

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
[vishal: Add badrange.o to nfit_test's Kbuild]
[vishal: add a missed include in bus.c for the new badrange functions]
[vishal: rename all instances of 'be' to 'bre']
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/core.c    |   2 +-
 drivers/acpi/nfit/mce.c     |   2 +-
 drivers/nvdimm/Makefile     |   1 +
 drivers/nvdimm/badrange.c   | 294 ++++++++++++++++++++++++++++++++++++++++++++
 drivers/nvdimm/bus.c        |  24 ++--
 drivers/nvdimm/core.c       | 260 +--------------------------------------
 drivers/nvdimm/nd-core.h    |   3 +-
 drivers/nvdimm/nd.h         |   6 -
 include/linux/libnvdimm.h   |  21 +++-
 tools/testing/nvdimm/Kbuild |   1 +
 10 files changed, 332 insertions(+), 282 deletions(-)
 create mode 100644 drivers/nvdimm/badrange.c

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index aa75d67ceee9..8043bfde7c63 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -2486,7 +2486,7 @@ static int ars_status_process_records(struct acpi_nfit_desc *acpi_desc,
 		if (ars_status->out_length
 				< 44 + sizeof(struct nd_ars_record) * (i + 1))
 			break;
-		rc = nvdimm_bus_add_poison(nvdimm_bus,
+		rc = nvdimm_bus_add_badrange(nvdimm_bus,
 				ars_status->records[i].err_address,
 				ars_status->records[i].length);
 		if (rc)
diff --git a/drivers/acpi/nfit/mce.c b/drivers/acpi/nfit/mce.c
index feeb95d574fa..b92921439657 100644
--- a/drivers/acpi/nfit/mce.c
+++ b/drivers/acpi/nfit/mce.c
@@ -67,7 +67,7 @@ static int nfit_handle_mce(struct notifier_block *nb, unsigned long val,
 			continue;
 
 		/* If this fails due to an -ENOMEM, there is little we can do */
-		nvdimm_bus_add_poison(acpi_desc->nvdimm_bus,
+		nvdimm_bus_add_badrange(acpi_desc->nvdimm_bus,
 				ALIGN(mce->addr, L1_CACHE_BYTES),
 				L1_CACHE_BYTES);
 		nvdimm_region_notify(nfit_spa->nd_region,
diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile
index 909554c3f955..ca6d325438a3 100644
--- a/drivers/nvdimm/Makefile
+++ b/drivers/nvdimm/Makefile
@@ -20,6 +20,7 @@ libnvdimm-y += region_devs.o
 libnvdimm-y += region.o
 libnvdimm-y += namespace_devs.o
 libnvdimm-y += label.o
+libnvdimm-y += badrange.o
 libnvdimm-$(CONFIG_ND_CLAIM) += claim.o
 libnvdimm-$(CONFIG_BTT) += btt_devs.o
 libnvdimm-$(CONFIG_NVDIMM_PFN) += pfn_devs.o
diff --git a/drivers/nvdimm/badrange.c b/drivers/nvdimm/badrange.c
new file mode 100644
index 000000000000..0b67dcf43234
--- /dev/null
+++ b/drivers/nvdimm/badrange.c
@@ -0,0 +1,294 @@
+/*
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/libnvdimm.h>
+#include <linux/badblocks.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/blkdev.h>
+#include <linux/device.h>
+#include <linux/ctype.h>
+#include <linux/ndctl.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/io.h>
+#include "nd-core.h"
+#include "nd.h"
+
+void badrange_init(struct badrange *badrange)
+{
+	INIT_LIST_HEAD(&badrange->list);
+	spin_lock_init(&badrange->lock);
+}
+EXPORT_SYMBOL_GPL(badrange_init);
+
+static void append_badrange_entry(struct badrange *badrange,
+		struct badrange_entry *bre, u64 addr, u64 length)
+{
+	lockdep_assert_held(&badrange->lock);
+	bre->start = addr;
+	bre->length = length;
+	list_add_tail(&bre->list, &badrange->list);
+}
+
+static int alloc_and_append_badrange_entry(struct badrange *badrange,
+		u64 addr, u64 length, gfp_t flags)
+{
+	struct badrange_entry *bre;
+
+	bre = kzalloc(sizeof(*bre), flags);
+	if (!bre)
+		return -ENOMEM;
+
+	append_badrange_entry(badrange, bre, addr, length);
+	return 0;
+}
+
+static int add_badrange(struct badrange *badrange, u64 addr, u64 length)
+{
+	struct badrange_entry *bre, *bre_new;
+
+	spin_unlock(&badrange->lock);
+	bre_new = kzalloc(sizeof(*bre_new), GFP_KERNEL);
+	spin_lock(&badrange->lock);
+
+	if (list_empty(&badrange->list)) {
+		if (!bre_new)
+			return -ENOMEM;
+		append_badrange_entry(badrange, bre_new, addr, length);
+		return 0;
+	}
+
+	/*
+	 * There is a chance this is a duplicate, check for those first.
+	 * This will be the common case as ARS_STATUS returns all known
+	 * errors in the SPA space, and we can't query it per region
+	 */
+	list_for_each_entry(bre, &badrange->list, list)
+		if (bre->start == addr) {
+			/* If length has changed, update this list entry */
+			if (bre->length != length)
+				bre->length = length;
+			kfree(bre_new);
+			return 0;
+		}
+
+	/*
+	 * If not a duplicate or a simple length update, add the entry as is,
+	 * as any overlapping ranges will get resolved when the list is consumed
+	 * and converted to badblocks
+	 */
+	if (!bre_new)
+		return -ENOMEM;
+	append_badrange_entry(badrange, bre_new, addr, length);
+
+	return 0;
+}
+
+int badrange_add(struct badrange *badrange, u64 addr, u64 length)
+{
+	int rc;
+
+	spin_lock(&badrange->lock);
+	rc = add_badrange(badrange, addr, length);
+	spin_unlock(&badrange->lock);
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(badrange_add);
+
+void badrange_forget(struct badrange *badrange, phys_addr_t start,
+		unsigned int len)
+{
+	struct list_head *badrange_list = &badrange->list;
+	u64 clr_end = start + len - 1;
+	struct badrange_entry *bre, *next;
+
+	spin_lock(&badrange->lock);
+	WARN_ON_ONCE(list_empty(badrange_list));
+
+	/*
+	 * [start, clr_end] is the badrange interval being cleared.
+	 * [bre->start, bre_end] is the badrange_list entry we're comparing
+	 * the above interval against. The badrange list entry may need
+	 * to be modified (update either start or length), deleted, or
+	 * split into two based on the overlap characteristics
+	 */
+
+	list_for_each_entry_safe(bre, next, badrange_list, list) {
+		u64 bre_end = bre->start + bre->length - 1;
+
+		/* Skip intervals with no intersection */
+		if (bre_end < start)
+			continue;
+		if (bre->start >  clr_end)
+			continue;
+		/* Delete completely overlapped badrange entries */
+		if ((bre->start >= start) && (bre_end <= clr_end)) {
+			list_del(&bre->list);
+			kfree(bre);
+			continue;
+		}
+		/* Adjust start point of partially cleared entries */
+		if ((start <= bre->start) && (clr_end > bre->start)) {
+			bre->length -= clr_end - bre->start + 1;
+			bre->start = clr_end + 1;
+			continue;
+		}
+		/* Adjust bre->length for partial clearing at the tail end */
+		if ((bre->start < start) && (bre_end <= clr_end)) {
+			/* bre->start remains the same */
+			bre->length = start - bre->start;
+			continue;
+		}
+		/*
+		 * If clearing in the middle of an entry, we split it into
+		 * two by modifying the current entry to represent one half of
+		 * the split, and adding a new entry for the second half.
+		 */
+		if ((bre->start < start) && (bre_end > clr_end)) {
+			u64 new_start = clr_end + 1;
+			u64 new_len = bre_end - new_start + 1;
+
+			/* Add new entry covering the right half */
+			alloc_and_append_badrange_entry(badrange, new_start,
+					new_len, GFP_NOWAIT);
+			/* Adjust this entry to cover the left half */
+			bre->length = start - bre->start;
+			continue;
+		}
+	}
+	spin_unlock(&badrange->lock);
+}
+EXPORT_SYMBOL_GPL(badrange_forget);
+
+static void set_badblock(struct badblocks *bb, sector_t s, int num)
+{
+	dev_dbg(bb->dev, "Found a bad range (0x%llx, 0x%llx)\n",
+			(u64) s * 512, (u64) num * 512);
+	/* this isn't an error as the hardware will still throw an exception */
+	if (badblocks_set(bb, s, num, 1))
+		dev_info_once(bb->dev, "%s: failed for sector %llx\n",
+				__func__, (u64) s);
+}
+
+/**
+ * __add_badblock_range() - Convert a physical address range to bad sectors
+ * @bb:		badblocks instance to populate
+ * @ns_offset:	namespace offset where the error range begins (in bytes)
+ * @len:	number of bytes of badrange to be added
+ *
+ * This assumes that the range provided with (ns_offset, len) is within
+ * the bounds of physical addresses for this namespace, i.e. lies in the
+ * interval [ns_start, ns_start + ns_size)
+ */
+static void __add_badblock_range(struct badblocks *bb, u64 ns_offset, u64 len)
+{
+	const unsigned int sector_size = 512;
+	sector_t start_sector, end_sector;
+	u64 num_sectors;
+	u32 rem;
+
+	start_sector = div_u64(ns_offset, sector_size);
+	end_sector = div_u64_rem(ns_offset + len, sector_size, &rem);
+	if (rem)
+		end_sector++;
+	num_sectors = end_sector - start_sector;
+
+	if (unlikely(num_sectors > (u64)INT_MAX)) {
+		u64 remaining = num_sectors;
+		sector_t s = start_sector;
+
+		while (remaining) {
+			int done = min_t(u64, remaining, INT_MAX);
+
+			set_badblock(bb, s, done);
+			remaining -= done;
+			s += done;
+		}
+	} else
+		set_badblock(bb, start_sector, num_sectors);
+}
+
+static void badblocks_populate(struct badrange *badrange,
+		struct badblocks *bb, const struct resource *res)
+{
+	struct badrange_entry *bre;
+
+	if (list_empty(&badrange->list))
+		return;
+
+	list_for_each_entry(bre, &badrange->list, list) {
+		u64 bre_end = bre->start + bre->length - 1;
+
+		/* Discard intervals with no intersection */
+		if (bre_end < res->start)
+			continue;
+		if (bre->start >  res->end)
+			continue;
+		/* Deal with any overlap after start of the namespace */
+		if (bre->start >= res->start) {
+			u64 start = bre->start;
+			u64 len;
+
+			if (bre_end <= res->end)
+				len = bre->length;
+			else
+				len = res->start + resource_size(res)
+					- bre->start;
+			__add_badblock_range(bb, start - res->start, len);
+			continue;
+		}
+		/*
+		 * Deal with overlap for badrange starting before
+		 * the namespace.
+		 */
+		if (bre->start < res->start) {
+			u64 len;
+
+			if (bre_end < res->end)
+				len = bre->start + bre->length - res->start;
+			else
+				len = resource_size(res);
+			__add_badblock_range(bb, 0, len);
+		}
+	}
+}
+
+/**
+ * nvdimm_badblocks_populate() - Convert a list of badranges to badblocks
+ * @region: parent region of the range to interrogate
+ * @bb: badblocks instance to populate
+ * @res: resource range to consider
+ *
+ * The badrange list generated during bus initialization may contain
+ * multiple, possibly overlapping physical address ranges.  Compare each
+ * of these ranges to the resource range currently being initialized,
+ * and add badblocks entries for all matching sub-ranges
+ */
+void nvdimm_badblocks_populate(struct nd_region *nd_region,
+		struct badblocks *bb, const struct resource *res)
+{
+	struct nvdimm_bus *nvdimm_bus;
+
+	if (!is_memory(&nd_region->dev)) {
+		dev_WARN_ONCE(&nd_region->dev, 1,
+				"%s only valid for pmem regions\n", __func__);
+		return;
+	}
+	nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev);
+
+	nvdimm_bus_lock(&nvdimm_bus->dev);
+	badblocks_populate(&nvdimm_bus->badrange, bb, res);
+	nvdimm_bus_unlock(&nvdimm_bus->dev);
+}
+EXPORT_SYMBOL_GPL(nvdimm_badblocks_populate);
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index baf283986a7e..0a5e6cd758fe 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -11,6 +11,7 @@
  * General Public License for more details.
  */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/libnvdimm.h>
 #include <linux/sched/mm.h>
 #include <linux/vmalloc.h>
 #include <linux/uaccess.h>
@@ -221,7 +222,7 @@ static void nvdimm_account_cleared_poison(struct nvdimm_bus *nvdimm_bus,
 		phys_addr_t phys, u64 cleared)
 {
 	if (cleared > 0)
-		nvdimm_forget_poison(nvdimm_bus, phys, cleared);
+		badrange_forget(&nvdimm_bus->badrange, phys, cleared);
 
 	if (cleared > 0 && cleared / 512)
 		nvdimm_clear_badblocks_regions(nvdimm_bus, phys, cleared);
@@ -344,11 +345,10 @@ struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
 		return NULL;
 	INIT_LIST_HEAD(&nvdimm_bus->list);
 	INIT_LIST_HEAD(&nvdimm_bus->mapping_list);
-	INIT_LIST_HEAD(&nvdimm_bus->poison_list);
 	init_waitqueue_head(&nvdimm_bus->probe_wait);
 	nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL);
 	mutex_init(&nvdimm_bus->reconfig_mutex);
-	spin_lock_init(&nvdimm_bus->poison_lock);
+	badrange_init(&nvdimm_bus->badrange);
 	if (nvdimm_bus->id < 0) {
 		kfree(nvdimm_bus);
 		return NULL;
@@ -395,15 +395,15 @@ static int child_unregister(struct device *dev, void *data)
 	return 0;
 }
 
-static void free_poison_list(struct list_head *poison_list)
+static void free_badrange_list(struct list_head *badrange_list)
 {
-	struct nd_poison *pl, *next;
+	struct badrange_entry *bre, *next;
 
-	list_for_each_entry_safe(pl, next, poison_list, list) {
-		list_del(&pl->list);
-		kfree(pl);
+	list_for_each_entry_safe(bre, next, badrange_list, list) {
+		list_del(&bre->list);
+		kfree(bre);
 	}
-	list_del_init(poison_list);
+	list_del_init(badrange_list);
 }
 
 static int nd_bus_remove(struct device *dev)
@@ -417,9 +417,9 @@ static int nd_bus_remove(struct device *dev)
 	nd_synchronize();
 	device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister);
 
-	spin_lock(&nvdimm_bus->poison_lock);
-	free_poison_list(&nvdimm_bus->poison_list);
-	spin_unlock(&nvdimm_bus->poison_lock);
+	spin_lock(&nvdimm_bus->badrange.lock);
+	free_badrange_list(&nvdimm_bus->badrange.list);
+	spin_unlock(&nvdimm_bus->badrange.lock);
 
 	nvdimm_bus_destroy_ndctl(nvdimm_bus);
 
diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
index bb71f0cf8f5d..1dc527660637 100644
--- a/drivers/nvdimm/core.c
+++ b/drivers/nvdimm/core.c
@@ -398,265 +398,11 @@ struct attribute_group nvdimm_bus_attribute_group = {
 };
 EXPORT_SYMBOL_GPL(nvdimm_bus_attribute_group);
 
-static void set_badblock(struct badblocks *bb, sector_t s, int num)
+int nvdimm_bus_add_badrange(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
 {
-	dev_dbg(bb->dev, "Found a poison range (0x%llx, 0x%llx)\n",
-			(u64) s * 512, (u64) num * 512);
-	/* this isn't an error as the hardware will still throw an exception */
-	if (badblocks_set(bb, s, num, 1))
-		dev_info_once(bb->dev, "%s: failed for sector %llx\n",
-				__func__, (u64) s);
+	return badrange_add(&nvdimm_bus->badrange, addr, length);
 }
-
-/**
- * __add_badblock_range() - Convert a physical address range to bad sectors
- * @bb:		badblocks instance to populate
- * @ns_offset:	namespace offset where the error range begins (in bytes)
- * @len:	number of bytes of poison to be added
- *
- * This assumes that the range provided with (ns_offset, len) is within
- * the bounds of physical addresses for this namespace, i.e. lies in the
- * interval [ns_start, ns_start + ns_size)
- */
-static void __add_badblock_range(struct badblocks *bb, u64 ns_offset, u64 len)
-{
-	const unsigned int sector_size = 512;
-	sector_t start_sector, end_sector;
-	u64 num_sectors;
-	u32 rem;
-
-	start_sector = div_u64(ns_offset, sector_size);
-	end_sector = div_u64_rem(ns_offset + len, sector_size, &rem);
-	if (rem)
-		end_sector++;
-	num_sectors = end_sector - start_sector;
-
-	if (unlikely(num_sectors > (u64)INT_MAX)) {
-		u64 remaining = num_sectors;
-		sector_t s = start_sector;
-
-		while (remaining) {
-			int done = min_t(u64, remaining, INT_MAX);
-
-			set_badblock(bb, s, done);
-			remaining -= done;
-			s += done;
-		}
-	} else
-		set_badblock(bb, start_sector, num_sectors);
-}
-
-static void badblocks_populate(struct list_head *poison_list,
-		struct badblocks *bb, const struct resource *res)
-{
-	struct nd_poison *pl;
-
-	if (list_empty(poison_list))
-		return;
-
-	list_for_each_entry(pl, poison_list, list) {
-		u64 pl_end = pl->start + pl->length - 1;
-
-		/* Discard intervals with no intersection */
-		if (pl_end < res->start)
-			continue;
-		if (pl->start >  res->end)
-			continue;
-		/* Deal with any overlap after start of the namespace */
-		if (pl->start >= res->start) {
-			u64 start = pl->start;
-			u64 len;
-
-			if (pl_end <= res->end)
-				len = pl->length;
-			else
-				len = res->start + resource_size(res)
-					- pl->start;
-			__add_badblock_range(bb, start - res->start, len);
-			continue;
-		}
-		/* Deal with overlap for poison starting before the namespace */
-		if (pl->start < res->start) {
-			u64 len;
-
-			if (pl_end < res->end)
-				len = pl->start + pl->length - res->start;
-			else
-				len = resource_size(res);
-			__add_badblock_range(bb, 0, len);
-		}
-	}
-}
-
-/**
- * nvdimm_badblocks_populate() - Convert a list of poison ranges to badblocks
- * @region: parent region of the range to interrogate
- * @bb: badblocks instance to populate
- * @res: resource range to consider
- *
- * The poison list generated during bus initialization may contain
- * multiple, possibly overlapping physical address ranges.  Compare each
- * of these ranges to the resource range currently being initialized,
- * and add badblocks entries for all matching sub-ranges
- */
-void nvdimm_badblocks_populate(struct nd_region *nd_region,
-		struct badblocks *bb, const struct resource *res)
-{
-	struct nvdimm_bus *nvdimm_bus;
-	struct list_head *poison_list;
-
-	if (!is_memory(&nd_region->dev)) {
-		dev_WARN_ONCE(&nd_region->dev, 1,
-				"%s only valid for pmem regions\n", __func__);
-		return;
-	}
-	nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev);
-	poison_list = &nvdimm_bus->poison_list;
-
-	nvdimm_bus_lock(&nvdimm_bus->dev);
-	badblocks_populate(poison_list, bb, res);
-	nvdimm_bus_unlock(&nvdimm_bus->dev);
-}
-EXPORT_SYMBOL_GPL(nvdimm_badblocks_populate);
-
-static void append_poison_entry(struct nvdimm_bus *nvdimm_bus,
-		struct nd_poison *pl, u64 addr, u64 length)
-{
-	lockdep_assert_held(&nvdimm_bus->poison_lock);
-	pl->start = addr;
-	pl->length = length;
-	list_add_tail(&pl->list, &nvdimm_bus->poison_list);
-}
-
-static int add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length,
-			gfp_t flags)
-{
-	struct nd_poison *pl;
-
-	pl = kzalloc(sizeof(*pl), flags);
-	if (!pl)
-		return -ENOMEM;
-
-	append_poison_entry(nvdimm_bus, pl, addr, length);
-	return 0;
-}
-
-static int bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
-{
-	struct nd_poison *pl, *pl_new;
-
-	spin_unlock(&nvdimm_bus->poison_lock);
-	pl_new = kzalloc(sizeof(*pl_new), GFP_KERNEL);
-	spin_lock(&nvdimm_bus->poison_lock);
-
-	if (list_empty(&nvdimm_bus->poison_list)) {
-		if (!pl_new)
-			return -ENOMEM;
-		append_poison_entry(nvdimm_bus, pl_new, addr, length);
-		return 0;
-	}
-
-	/*
-	 * There is a chance this is a duplicate, check for those first.
-	 * This will be the common case as ARS_STATUS returns all known
-	 * errors in the SPA space, and we can't query it per region
-	 */
-	list_for_each_entry(pl, &nvdimm_bus->poison_list, list)
-		if (pl->start == addr) {
-			/* If length has changed, update this list entry */
-			if (pl->length != length)
-				pl->length = length;
-			kfree(pl_new);
-			return 0;
-		}
-
-	/*
-	 * If not a duplicate or a simple length update, add the entry as is,
-	 * as any overlapping ranges will get resolved when the list is consumed
-	 * and converted to badblocks
-	 */
-	if (!pl_new)
-		return -ENOMEM;
-	append_poison_entry(nvdimm_bus, pl_new, addr, length);
-
-	return 0;
-}
-
-int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
-{
-	int rc;
-
-	spin_lock(&nvdimm_bus->poison_lock);
-	rc = bus_add_poison(nvdimm_bus, addr, length);
-	spin_unlock(&nvdimm_bus->poison_lock);
-
-	return rc;
-}
-EXPORT_SYMBOL_GPL(nvdimm_bus_add_poison);
-
-void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus, phys_addr_t start,
-		unsigned int len)
-{
-	struct list_head *poison_list = &nvdimm_bus->poison_list;
-	u64 clr_end = start + len - 1;
-	struct nd_poison *pl, *next;
-
-	spin_lock(&nvdimm_bus->poison_lock);
-	WARN_ON_ONCE(list_empty(poison_list));
-
-	/*
-	 * [start, clr_end] is the poison interval being cleared.
-	 * [pl->start, pl_end] is the poison_list entry we're comparing
-	 * the above interval against. The poison list entry may need
-	 * to be modified (update either start or length), deleted, or
-	 * split into two based on the overlap characteristics
-	 */
-
-	list_for_each_entry_safe(pl, next, poison_list, list) {
-		u64 pl_end = pl->start + pl->length - 1;
-
-		/* Skip intervals with no intersection */
-		if (pl_end < start)
-			continue;
-		if (pl->start >  clr_end)
-			continue;
-		/* Delete completely overlapped poison entries */
-		if ((pl->start >= start) && (pl_end <= clr_end)) {
-			list_del(&pl->list);
-			kfree(pl);
-			continue;
-		}
-		/* Adjust start point of partially cleared entries */
-		if ((start <= pl->start) && (clr_end > pl->start)) {
-			pl->length -= clr_end - pl->start + 1;
-			pl->start = clr_end + 1;
-			continue;
-		}
-		/* Adjust pl->length for partial clearing at the tail end */
-		if ((pl->start < start) && (pl_end <= clr_end)) {
-			/* pl->start remains the same */
-			pl->length = start - pl->start;
-			continue;
-		}
-		/*
-		 * If clearing in the middle of an entry, we split it into
-		 * two by modifying the current entry to represent one half of
-		 * the split, and adding a new entry for the second half.
-		 */
-		if ((pl->start < start) && (pl_end > clr_end)) {
-			u64 new_start = clr_end + 1;
-			u64 new_len = pl_end - new_start + 1;
-
-			/* Add new entry covering the right half */
-			add_poison(nvdimm_bus, new_start, new_len, GFP_NOWAIT);
-			/* Adjust this entry to cover the left half */
-			pl->length = start - pl->start;
-			continue;
-		}
-	}
-	spin_unlock(&nvdimm_bus->poison_lock);
-}
-EXPORT_SYMBOL_GPL(nvdimm_forget_poison);
+EXPORT_SYMBOL_GPL(nvdimm_bus_add_badrange);
 
 #ifdef CONFIG_BLK_DEV_INTEGRITY
 int nd_integrity_init(struct gendisk *disk, unsigned long meta_size)
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
index 86bc19ae30da..79274ead54fb 100644
--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -29,10 +29,9 @@ struct nvdimm_bus {
 	struct list_head list;
 	struct device dev;
 	int id, probe_active;
-	struct list_head poison_list;
 	struct list_head mapping_list;
 	struct mutex reconfig_mutex;
-	spinlock_t poison_lock;
+	struct badrange badrange;
 };
 
 struct nvdimm {
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 156be00e1f76..e958f3724c41 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -34,12 +34,6 @@ enum {
 	NVDIMM_IO_ATOMIC = 1,
 };
 
-struct nd_poison {
-	u64 start;
-	u64 length;
-	struct list_head list;
-};
-
 struct nvdimm_drvdata {
 	struct device *dev;
 	int nslabel_size;
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 3eaad2fbf284..f8109ddb5ef1 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -18,6 +18,18 @@
 #include <linux/sizes.h>
 #include <linux/types.h>
 #include <linux/uuid.h>
+#include <linux/spinlock.h>
+
+struct badrange_entry {
+	u64 start;
+	u64 length;
+	struct list_head list;
+};
+
+struct badrange {
+	struct list_head list;
+	spinlock_t lock;
+};
 
 enum {
 	/* when a dimm supports both PMEM and BLK access a label is required */
@@ -129,9 +141,12 @@ static inline struct nd_blk_region_desc *to_blk_region_desc(
 
 }
 
-int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length);
-void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus,
-		phys_addr_t start, unsigned int len);
+void badrange_init(struct badrange *badrange);
+int badrange_add(struct badrange *badrange, u64 addr, u64 length);
+void badrange_forget(struct badrange *badrange, phys_addr_t start,
+		unsigned int len);
+int nvdimm_bus_add_badrange(struct nvdimm_bus *nvdimm_bus, u64 addr,
+		u64 length);
 struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
 		struct nvdimm_bus_descriptor *nfit_desc);
 void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus);
diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild
index d870520da68b..3272ab5a5cb9 100644
--- a/tools/testing/nvdimm/Kbuild
+++ b/tools/testing/nvdimm/Kbuild
@@ -69,6 +69,7 @@ libnvdimm-y += $(NVDIMM_SRC)/region_devs.o
 libnvdimm-y += $(NVDIMM_SRC)/region.o
 libnvdimm-y += $(NVDIMM_SRC)/namespace_devs.o
 libnvdimm-y += $(NVDIMM_SRC)/label.o
+libnvdimm-y += $(NVDIMM_SRC)/badrange.o
 libnvdimm-$(CONFIG_ND_CLAIM) += $(NVDIMM_SRC)/claim.o
 libnvdimm-$(CONFIG_BTT) += $(NVDIMM_SRC)/btt_devs.o
 libnvdimm-$(CONFIG_NVDIMM_PFN) += $(NVDIMM_SRC)/pfn_devs.o
-- 
cgit v1.2.3


From 9fb1a1903345fea598f48277576a3589a972b72e Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Mon, 30 Oct 2017 13:22:20 -0700
Subject: nfit_test: add error injection DSMs

Add nfit_test emulation for the new ACPI 6.2 error injectino DSMs.
This will allow unit tests to selectively inject the errors they wish to
test for.

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
[vishal: Move injection functions to ND_CMD_CALL]
[vishal: Add support for the notification option]
[vishal: move an nfit_test private definition into a local header]
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 tools/testing/nvdimm/test/nfit.c      | 187 +++++++++++++++++++++++++++++-----
 tools/testing/nvdimm/test/nfit_test.h |   5 +
 2 files changed, 168 insertions(+), 24 deletions(-)

diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index 3a3bad348d56..788b1a5d6c5a 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -168,8 +168,12 @@ struct nfit_test {
 		spinlock_t lock;
 	} ars_state;
 	struct device *dimm_dev[NUM_DCR];
+	struct badrange badrange;
+	struct work_struct work;
 };
 
+static struct workqueue_struct *nfit_wq;
+
 static struct nfit_test *to_nfit_test(struct device *dev)
 {
 	struct platform_device *pdev = to_platform_device(dev);
@@ -234,48 +238,68 @@ static int nfit_test_cmd_set_config_data(struct nd_cmd_set_config_hdr *nd_cmd,
 	return rc;
 }
 
-#define NFIT_TEST_ARS_RECORDS 4
 #define NFIT_TEST_CLEAR_ERR_UNIT 256
 
 static int nfit_test_cmd_ars_cap(struct nd_cmd_ars_cap *nd_cmd,
 		unsigned int buf_len)
 {
+	int ars_recs;
+
 	if (buf_len < sizeof(*nd_cmd))
 		return -EINVAL;
 
+	/* for testing, only store up to n records that fit within 4k */
+	ars_recs = SZ_4K / sizeof(struct nd_ars_record);
+
 	nd_cmd->max_ars_out = sizeof(struct nd_cmd_ars_status)
-		+ NFIT_TEST_ARS_RECORDS * sizeof(struct nd_ars_record);
+		+ ars_recs * sizeof(struct nd_ars_record);
 	nd_cmd->status = (ND_ARS_PERSISTENT | ND_ARS_VOLATILE) << 16;
 	nd_cmd->clear_err_unit = NFIT_TEST_CLEAR_ERR_UNIT;
 
 	return 0;
 }
 
-/*
- * Initialize the ars_state to return an ars_result 1 second in the future with
- * a 4K error range in the middle of the requested address range.
- */
-static void post_ars_status(struct ars_state *ars_state, u64 addr, u64 len)
+static void post_ars_status(struct ars_state *ars_state,
+		struct badrange *badrange, u64 addr, u64 len)
 {
 	struct nd_cmd_ars_status *ars_status;
 	struct nd_ars_record *ars_record;
+	struct badrange_entry *be;
+	u64 end = addr + len - 1;
+	int i = 0;
 
 	ars_state->deadline = jiffies + 1*HZ;
 	ars_status = ars_state->ars_status;
 	ars_status->status = 0;
-	ars_status->out_length = sizeof(struct nd_cmd_ars_status)
-		+ sizeof(struct nd_ars_record);
 	ars_status->address = addr;
 	ars_status->length = len;
 	ars_status->type = ND_ARS_PERSISTENT;
-	ars_status->num_records = 1;
-	ars_record = &ars_status->records[0];
-	ars_record->handle = 0;
-	ars_record->err_address = addr + len / 2;
-	ars_record->length = SZ_4K;
+
+	spin_lock(&badrange->lock);
+	list_for_each_entry(be, &badrange->list, list) {
+		u64 be_end = be->start + be->length - 1;
+		u64 rstart, rend;
+
+		/* skip entries outside the range */
+		if (be_end < addr || be->start > end)
+			continue;
+
+		rstart = (be->start < addr) ? addr : be->start;
+		rend = (be_end < end) ? be_end : end;
+		ars_record = &ars_status->records[i];
+		ars_record->handle = 0;
+		ars_record->err_address = rstart;
+		ars_record->length = rend - rstart + 1;
+		i++;
+	}
+	spin_unlock(&badrange->lock);
+	ars_status->num_records = i;
+	ars_status->out_length = sizeof(struct nd_cmd_ars_status)
+		+ i * sizeof(struct nd_ars_record);
 }
 
-static int nfit_test_cmd_ars_start(struct ars_state *ars_state,
+static int nfit_test_cmd_ars_start(struct nfit_test *t,
+		struct ars_state *ars_state,
 		struct nd_cmd_ars_start *ars_start, unsigned int buf_len,
 		int *cmd_rc)
 {
@@ -289,7 +313,7 @@ static int nfit_test_cmd_ars_start(struct ars_state *ars_state,
 	} else {
 		ars_start->status = 0;
 		ars_start->scrub_time = 1;
-		post_ars_status(ars_state, ars_start->address,
+		post_ars_status(ars_state, &t->badrange, ars_start->address,
 				ars_start->length);
 		*cmd_rc = 0;
 	}
@@ -456,6 +480,93 @@ static int nfit_test_cmd_smart_threshold(struct nd_cmd_smart_threshold *smart_t,
 	return 0;
 }
 
+static void uc_error_notify(struct work_struct *work)
+{
+	struct nfit_test *t = container_of(work, typeof(*t), work);
+
+	__acpi_nfit_notify(&t->pdev.dev, t, NFIT_NOTIFY_UC_MEMORY_ERROR);
+}
+
+static int nfit_test_cmd_ars_error_inject(struct nfit_test *t,
+		struct nd_cmd_ars_err_inj *err_inj, unsigned int buf_len)
+{
+	int rc;
+
+	if (buf_len < sizeof(*err_inj)) {
+		rc = -EINVAL;
+		goto err;
+	}
+
+	if (err_inj->err_inj_spa_range_length <= 0) {
+		rc = -EINVAL;
+		goto err;
+	}
+
+	rc =  badrange_add(&t->badrange, err_inj->err_inj_spa_range_base,
+			err_inj->err_inj_spa_range_length);
+	if (rc < 0)
+		goto err;
+
+	if (err_inj->err_inj_options & (1 << ND_ARS_ERR_INJ_OPT_NOTIFY))
+		queue_work(nfit_wq, &t->work);
+
+	err_inj->status = 0;
+	return 0;
+
+err:
+	err_inj->status = NFIT_ARS_INJECT_INVALID;
+	return rc;
+}
+
+static int nfit_test_cmd_ars_inject_clear(struct nfit_test *t,
+		struct nd_cmd_ars_err_inj_clr *err_clr, unsigned int buf_len)
+{
+	int rc;
+
+	if (buf_len < sizeof(*err_clr)) {
+		rc = -EINVAL;
+		goto err;
+	}
+
+	if (err_clr->err_inj_clr_spa_range_length <= 0) {
+		rc = -EINVAL;
+		goto err;
+	}
+
+	badrange_forget(&t->badrange, err_clr->err_inj_clr_spa_range_base,
+			err_clr->err_inj_clr_spa_range_length);
+
+	err_clr->status = 0;
+	return 0;
+
+err:
+	err_clr->status = NFIT_ARS_INJECT_INVALID;
+	return rc;
+}
+
+static int nfit_test_cmd_ars_inject_status(struct nfit_test *t,
+		struct nd_cmd_ars_err_inj_stat *err_stat,
+		unsigned int buf_len)
+{
+	struct badrange_entry *be;
+	int max = SZ_4K / sizeof(struct nd_error_stat_query_record);
+	int i = 0;
+
+	err_stat->status = 0;
+	spin_lock(&t->badrange.lock);
+	list_for_each_entry(be, &t->badrange.list, list) {
+		err_stat->record[i].err_inj_stat_spa_range_base = be->start;
+		err_stat->record[i].err_inj_stat_spa_range_length = be->length;
+		i++;
+		if (i > max)
+			break;
+	}
+	spin_unlock(&t->badrange.lock);
+	err_stat->inj_err_rec_count = i;
+
+	return 0;
+}
+
 static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
 		struct nvdimm *nvdimm, unsigned int cmd, void *buf,
 		unsigned int buf_len, int *cmd_rc)
@@ -543,6 +654,18 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
 				rc = nfit_test_cmd_translate_spa(
 					acpi_desc->nvdimm_bus, buf, buf_len);
 				return rc;
+			case NFIT_CMD_ARS_INJECT_SET:
+				rc = nfit_test_cmd_ars_error_inject(t, buf,
+					buf_len);
+				return rc;
+			case NFIT_CMD_ARS_INJECT_CLEAR:
+				rc = nfit_test_cmd_ars_inject_clear(t, buf,
+					buf_len);
+				return rc;
+			case NFIT_CMD_ARS_INJECT_GET:
+				rc = nfit_test_cmd_ars_inject_status(t, buf,
+					buf_len);
+				return rc;
 			default:
 				return -ENOTTY;
 			}
@@ -556,8 +679,8 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
 			rc = nfit_test_cmd_ars_cap(buf, buf_len);
 			break;
 		case ND_CMD_ARS_START:
-			rc = nfit_test_cmd_ars_start(ars_state, buf, buf_len,
-					cmd_rc);
+			rc = nfit_test_cmd_ars_start(t, ars_state, buf,
+					buf_len, cmd_rc);
 			break;
 		case ND_CMD_ARS_STATUS:
 			rc = nfit_test_cmd_ars_status(ars_state, buf, buf_len,
@@ -664,10 +787,9 @@ static struct nfit_test_resource *nfit_test_lookup(resource_size_t addr)
 
 static int ars_state_init(struct device *dev, struct ars_state *ars_state)
 {
+	/* for testing, only store up to n records that fit within 4k */
 	ars_state->ars_status = devm_kzalloc(dev,
-			sizeof(struct nd_cmd_ars_status)
-			+ sizeof(struct nd_ars_record) * NFIT_TEST_ARS_RECORDS,
-			GFP_KERNEL);
+			sizeof(struct nd_cmd_ars_status) + SZ_4K, GFP_KERNEL);
 	if (!ars_state->ars_status)
 		return -ENOMEM;
 	spin_lock_init(&ars_state->lock);
@@ -1517,7 +1639,8 @@ static void nfit_test0_setup(struct nfit_test *t)
 				+ i * sizeof(u64);
 	}
 
-	post_ars_status(&t->ars_state, t->spa_set_dma[0], SPA0_SIZE);
+	post_ars_status(&t->ars_state, &t->badrange, t->spa_set_dma[0],
+			SPA0_SIZE);
 
 	acpi_desc = &t->acpi_desc;
 	set_bit(ND_CMD_GET_CONFIG_SIZE, &acpi_desc->dimm_cmd_force_en);
@@ -1531,6 +1654,9 @@ static void nfit_test0_setup(struct nfit_test *t)
 	set_bit(ND_CMD_CALL, &acpi_desc->bus_cmd_force_en);
 	set_bit(ND_CMD_SMART_THRESHOLD, &acpi_desc->dimm_cmd_force_en);
 	set_bit(NFIT_CMD_TRANSLATE_SPA, &acpi_desc->bus_nfit_cmd_force_en);
+	set_bit(NFIT_CMD_ARS_INJECT_SET, &acpi_desc->bus_nfit_cmd_force_en);
+	set_bit(NFIT_CMD_ARS_INJECT_CLEAR, &acpi_desc->bus_nfit_cmd_force_en);
+	set_bit(NFIT_CMD_ARS_INJECT_GET, &acpi_desc->bus_nfit_cmd_force_en);
 }
 
 static void nfit_test1_setup(struct nfit_test *t)
@@ -1620,7 +1746,8 @@ static void nfit_test1_setup(struct nfit_test *t)
 	dcr->code = NFIT_FIC_BYTE;
 	dcr->windows = 0;
 
-	post_ars_status(&t->ars_state, t->spa_set_dma[0], SPA2_SIZE);
+	post_ars_status(&t->ars_state, &t->badrange, t->spa_set_dma[0],
+			SPA2_SIZE);
 
 	acpi_desc = &t->acpi_desc;
 	set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_cmd_force_en);
@@ -1718,7 +1845,10 @@ static int nfit_ctl_test(struct device *dev)
 			.module = THIS_MODULE,
 			.provider_name = "ACPI.NFIT",
 			.ndctl = acpi_nfit_ctl,
-			.bus_dsm_mask = 1UL << NFIT_CMD_TRANSLATE_SPA,
+			.bus_dsm_mask = 1UL << NFIT_CMD_TRANSLATE_SPA
+				| 1UL << NFIT_CMD_ARS_INJECT_SET
+				| 1UL << NFIT_CMD_ARS_INJECT_CLEAR
+				| 1UL << NFIT_CMD_ARS_INJECT_GET,
 		},
 		.dev = &adev->dev,
 	};
@@ -2017,6 +2147,10 @@ static __init int nfit_test_init(void)
 
 	nfit_test_setup(nfit_test_lookup, nfit_test_evaluate_dsm);
 
+	nfit_wq = create_singlethread_workqueue("nfit");
+	if (!nfit_wq)
+		return -ENOMEM;
+
 	nfit_test_dimm = class_create(THIS_MODULE, "nfit_test_dimm");
 	if (IS_ERR(nfit_test_dimm)) {
 		rc = PTR_ERR(nfit_test_dimm);
@@ -2033,6 +2167,7 @@ static __init int nfit_test_init(void)
 			goto err_register;
 		}
 		INIT_LIST_HEAD(&nfit_test->resources);
+		badrange_init(&nfit_test->badrange);
 		switch (i) {
 		case 0:
 			nfit_test->num_pm = NUM_PM;
@@ -2068,6 +2203,7 @@ static __init int nfit_test_init(void)
 			goto err_register;
 
 		instances[i] = nfit_test;
+		INIT_WORK(&nfit_test->work, uc_error_notify);
 	}
 
 	rc = platform_driver_register(&nfit_test_driver);
@@ -2076,6 +2212,7 @@ static __init int nfit_test_init(void)
 	return 0;
 
  err_register:
+	destroy_workqueue(nfit_wq);
 	for (i = 0; i < NUM_NFITS; i++)
 		if (instances[i])
 			platform_device_unregister(&instances[i]->pdev);
@@ -2091,6 +2228,8 @@ static __exit void nfit_test_exit(void)
 {
 	int i;
 
+	flush_workqueue(nfit_wq);
+	destroy_workqueue(nfit_wq);
 	for (i = 0; i < NUM_NFITS; i++)
 		platform_device_unregister(&instances[i]->pdev);
 	platform_driver_unregister(&nfit_test_driver);
diff --git a/tools/testing/nvdimm/test/nfit_test.h b/tools/testing/nvdimm/test/nfit_test.h
index 52c83be9dcfa..113b44675a71 100644
--- a/tools/testing/nvdimm/test/nfit_test.h
+++ b/tools/testing/nvdimm/test/nfit_test.h
@@ -33,6 +33,11 @@ struct nfit_test_resource {
 };
 
 #define ND_TRANSLATE_SPA_STATUS_INVALID_SPA  2
+#define NFIT_ARS_INJECT_INVALID 2
+
+enum err_inj_options {
+	ND_ARS_ERR_INJ_OPT_NOTIFY = 0,
+};
 
 /* nfit commands */
 enum nfit_cmd_num {
-- 
cgit v1.2.3


From 89360b87c3efa5d265a3b43dd8fc245c8f4356e3 Mon Sep 17 00:00:00 2001
From: Vishal Verma <vishal.l.verma@intel.com>
Date: Mon, 30 Oct 2017 13:22:21 -0700
Subject: libnvdimm, badrange: remove a WARN for list_empty

Now that we're reusing the badrange functions for nfit_test, and that
exposes badrange injection/clearing to userspace via the DSM paths, it
is plausible that a user may call the clear DSM multiple times. Since it
is harmless to do so, we can remove the WARN in badrange_forget.

Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/badrange.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/nvdimm/badrange.c b/drivers/nvdimm/badrange.c
index 0b67dcf43234..e068d72b4357 100644
--- a/drivers/nvdimm/badrange.c
+++ b/drivers/nvdimm/badrange.c
@@ -114,7 +114,6 @@ void badrange_forget(struct badrange *badrange, phys_addr_t start,
 	struct badrange_entry *bre, *next;
 
 	spin_lock(&badrange->lock);
-	WARN_ON_ONCE(list_empty(badrange_list));
 
 	/*
 	 * [start, clr_end] is the badrange interval being cleared.
-- 
cgit v1.2.3


From 5e096ef3b29e6113a7aa6a7dc4871987efa99bcb Mon Sep 17 00:00:00 2001
From: Vishal Verma <vishal.l.verma@intel.com>
Date: Mon, 30 Oct 2017 13:22:23 -0700
Subject: nfit_test: when clearing poison, also remove badrange entries

The injected badrange entries can only be cleared from the kernel's
accounting by writing to the affected blocks, so when such a write sends
the clear errror DSM to nfit_test, also clear the ranges from
nfit_test's badrange list. This lets an 'ARS Inject error status' DSM to
return the correct status, omitting the cleared ranges.

Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 tools/testing/nvdimm/test/nfit.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index 788b1a5d6c5a..79c5cb23693f 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -344,7 +344,8 @@ static int nfit_test_cmd_ars_status(struct ars_state *ars_state,
 	return 0;
 }
 
-static int nfit_test_cmd_clear_error(struct nd_cmd_clear_error *clear_err,
+static int nfit_test_cmd_clear_error(struct nfit_test *t,
+		struct nd_cmd_clear_error *clear_err,
 		unsigned int buf_len, int *cmd_rc)
 {
 	const u64 mask = NFIT_TEST_CLEAR_ERR_UNIT - 1;
@@ -354,12 +355,7 @@ static int nfit_test_cmd_clear_error(struct nd_cmd_clear_error *clear_err,
 	if ((clear_err->address & mask) || (clear_err->length & mask))
 		return -EINVAL;
 
-	/*
-	 * Report 'all clear' success for all commands even though a new
-	 * scrub will find errors again.  This is enough to have the
-	 * error removed from the 'badblocks' tracking in the pmem
-	 * driver.
-	 */
+	badrange_forget(&t->badrange, clear_err->address, clear_err->length);
 	clear_err->status = 0;
 	clear_err->cleared = clear_err->length;
 	*cmd_rc = 0;
@@ -687,7 +683,7 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
 					cmd_rc);
 			break;
 		case ND_CMD_CLEAR_ERROR:
-			rc = nfit_test_cmd_clear_error(buf, buf_len, cmd_rc);
+			rc = nfit_test_cmd_clear_error(t, buf, buf_len, cmd_rc);
 			break;
 		default:
 			return -ENOTTY;
-- 
cgit v1.2.3


From 592e254502041f953e84d091eae2c68cba04c10b Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 3 Nov 2017 12:21:21 +0100
Subject: mm: Handle 0 flags in _calc_vm_trans() macro

_calc_vm_trans() does not handle the situation when some of the passed
flags are 0 (which can happen if these VM flags do not make sense for
the architecture). Improve the _calc_vm_trans() macro to return 0 in
such situation. Since all passed flags are constant, this does not add
any runtime overhead.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/mman.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/linux/mman.h b/include/linux/mman.h
index c8367041fafd..edb6cf6a81ed 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -63,8 +63,9 @@ static inline bool arch_validate_prot(unsigned long prot)
  * ("bit1" and "bit2" must be single bits)
  */
 #define _calc_vm_trans(x, bit1, bit2) \
+  ((!(bit1) || !(bit2)) ? 0 : \
   ((bit1) <= (bit2) ? ((x) & (bit1)) * ((bit2) / (bit1)) \
-   : ((x) & (bit1)) / ((bit1) / (bit2)))
+   : ((x) & (bit1)) / ((bit1) / (bit2))))
 
 /*
  * Combine the mmap "prot" argument into "vm_flags" used internally.
-- 
cgit v1.2.3


From 1c9725974074a047f6080eecc62c50a8e840d050 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 1 Nov 2017 16:36:30 +0100
Subject: mm: introduce MAP_SHARED_VALIDATE, a mechanism to safely define new
 mmap flags

The mmap(2) syscall suffers from the ABI anti-pattern of not validating
unknown flags. However, proposals like MAP_SYNC need a mechanism to
define new behavior that is known to fail on older kernels without the
support. Define a new MAP_SHARED_VALIDATE flag pattern that is
guaranteed to fail on all legacy mmap implementations.

It is worth noting that the original proposal was for a standalone
MAP_VALIDATE flag. However, when that  could not be supported by all
archs Linus observed:

    I see why you *think* you want a bitmap. You think you want
    a bitmap because you want to make MAP_VALIDATE be part of MAP_SYNC
    etc, so that people can do

    ret = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED
		    | MAP_SYNC, fd, 0);

    and "know" that MAP_SYNC actually takes.

    And I'm saying that whole wish is bogus. You're fundamentally
    depending on special semantics, just make it explicit. It's already
    not portable, so don't try to make it so.

    Rename that MAP_VALIDATE as MAP_SHARED_VALIDATE, make it have a value
    of 0x3, and make people do

    ret = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED_VALIDATE
		    | MAP_SYNC, fd, 0);

    and then the kernel side is easier too (none of that random garbage
    playing games with looking at the "MAP_VALIDATE bit", but just another
    case statement in that map type thing.

    Boom. Done.

Similar to ->fallocate() we also want the ability to validate the
support for new flags on a per ->mmap() 'struct file_operations'
instance basis.  Towards that end arrange for flags to be generically
validated against a mmap_supported_flags exported by 'struct
file_operations'. By default all existing flags are implicitly
supported, but new flags require MAP_SHARED_VALIDATE and
per-instance-opt-in.

Cc: Jan Kara <jack@suse.cz>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Suggested-by: Christoph Hellwig <hch@lst.de>
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/alpha/include/uapi/asm/mman.h           |  1 +
 arch/mips/include/uapi/asm/mman.h            |  1 +
 arch/parisc/include/uapi/asm/mman.h          |  1 +
 arch/xtensa/include/uapi/asm/mman.h          |  1 +
 include/linux/fs.h                           |  1 +
 include/linux/mman.h                         | 39 ++++++++++++++++++++++++++++
 include/uapi/asm-generic/mman-common.h       |  1 +
 mm/mmap.c                                    | 15 +++++++++++
 tools/include/uapi/asm-generic/mman-common.h |  1 +
 9 files changed, 61 insertions(+)

diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
index 3b26cc62dadb..f6d118aaedb9 100644
--- a/arch/alpha/include/uapi/asm/mman.h
+++ b/arch/alpha/include/uapi/asm/mman.h
@@ -11,6 +11,7 @@
 
 #define MAP_SHARED	0x01		/* Share changes */
 #define MAP_PRIVATE	0x02		/* Changes are private */
+#define MAP_SHARED_VALIDATE 0x03	/* share + validate extension flags */
 #define MAP_TYPE	0x0f		/* Mask for type of mapping (OSF/1 is _wrong_) */
 #define MAP_FIXED	0x100		/* Interpret addr exactly */
 #define MAP_ANONYMOUS	0x10		/* don't use a file */
diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h
index da3216007fe0..93268e4cd3c7 100644
--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -28,6 +28,7 @@
  */
 #define MAP_SHARED	0x001		/* Share changes */
 #define MAP_PRIVATE	0x002		/* Changes are private */
+#define MAP_SHARED_VALIDATE 0x003	/* share + validate extension flags */
 #define MAP_TYPE	0x00f		/* Mask for type of mapping */
 #define MAP_FIXED	0x010		/* Interpret addr exactly */
 
diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h
index 775b5d5e41a1..bca652aa1677 100644
--- a/arch/parisc/include/uapi/asm/mman.h
+++ b/arch/parisc/include/uapi/asm/mman.h
@@ -11,6 +11,7 @@
 
 #define MAP_SHARED	0x01		/* Share changes */
 #define MAP_PRIVATE	0x02		/* Changes are private */
+#define MAP_SHARED_VALIDATE 0x03	/* share + validate extension flags */
 #define MAP_TYPE	0x03		/* Mask for type of mapping */
 #define MAP_FIXED	0x04		/* Interpret addr exactly */
 #define MAP_ANONYMOUS	0x10		/* don't use a file */
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h
index b15b278aa314..9ab426374714 100644
--- a/arch/xtensa/include/uapi/asm/mman.h
+++ b/arch/xtensa/include/uapi/asm/mman.h
@@ -35,6 +35,7 @@
  */
 #define MAP_SHARED	0x001		/* Share changes */
 #define MAP_PRIVATE	0x002		/* Changes are private */
+#define MAP_SHARED_VALIDATE 0x003	/* share + validate extension flags */
 #define MAP_TYPE	0x00f		/* Mask for type of mapping */
 #define MAP_FIXED	0x010		/* Interpret addr exactly */
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 13dab191a23e..57added3201d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1701,6 +1701,7 @@ struct file_operations {
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
 	int (*mmap) (struct file *, struct vm_area_struct *);
+	unsigned long mmap_supported_flags;
 	int (*open) (struct inode *, struct file *);
 	int (*flush) (struct file *, fl_owner_t id);
 	int (*release) (struct inode *, struct file *);
diff --git a/include/linux/mman.h b/include/linux/mman.h
index edb6cf6a81ed..74452e3f2536 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -7,6 +7,45 @@
 #include <linux/atomic.h>
 #include <uapi/linux/mman.h>
 
+/*
+ * Arrange for legacy / undefined architecture specific flags to be
+ * ignored by default in LEGACY_MAP_MASK.
+ */
+#ifndef MAP_32BIT
+#define MAP_32BIT 0
+#endif
+#ifndef MAP_HUGE_2MB
+#define MAP_HUGE_2MB 0
+#endif
+#ifndef MAP_HUGE_1GB
+#define MAP_HUGE_1GB 0
+#endif
+#ifndef MAP_UNINITIALIZED
+#define MAP_UNINITIALIZED 0
+#endif
+
+/*
+ * The historical set of flags that all mmap implementations implicitly
+ * support when a ->mmap_validate() op is not provided in file_operations.
+ */
+#define LEGACY_MAP_MASK (MAP_SHARED \
+		| MAP_PRIVATE \
+		| MAP_FIXED \
+		| MAP_ANONYMOUS \
+		| MAP_DENYWRITE \
+		| MAP_EXECUTABLE \
+		| MAP_UNINITIALIZED \
+		| MAP_GROWSDOWN \
+		| MAP_LOCKED \
+		| MAP_NORESERVE \
+		| MAP_POPULATE \
+		| MAP_NONBLOCK \
+		| MAP_STACK \
+		| MAP_HUGETLB \
+		| MAP_32BIT \
+		| MAP_HUGE_2MB \
+		| MAP_HUGE_1GB)
+
 extern int sysctl_overcommit_memory;
 extern int sysctl_overcommit_ratio;
 extern unsigned long sysctl_overcommit_kbytes;
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 203268f9231e..8ce7f5a0800f 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -16,6 +16,7 @@
 
 #define MAP_SHARED	0x01		/* Share changes */
 #define MAP_PRIVATE	0x02		/* Changes are private */
+#define MAP_SHARED_VALIDATE 0x03	/* share + validate extension flags */
 #define MAP_TYPE	0x0f		/* Mask for type of mapping */
 #define MAP_FIXED	0x10		/* Interpret addr exactly */
 #define MAP_ANONYMOUS	0x20		/* don't use a file */
diff --git a/mm/mmap.c b/mm/mmap.c
index 680506faceae..924839fac0e6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1387,9 +1387,24 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 
 	if (file) {
 		struct inode *inode = file_inode(file);
+		unsigned long flags_mask;
+
+		flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags;
 
 		switch (flags & MAP_TYPE) {
 		case MAP_SHARED:
+			/*
+			 * Force use of MAP_SHARED_VALIDATE with non-legacy
+			 * flags. E.g. MAP_SYNC is dangerous to use with
+			 * MAP_SHARED as you don't know which consistency model
+			 * you will get. We silently ignore unsupported flags
+			 * with MAP_SHARED to preserve backward compatibility.
+			 */
+			flags &= LEGACY_MAP_MASK;
+			/* fall through */
+		case MAP_SHARED_VALIDATE:
+			if (flags & ~flags_mask)
+				return -EOPNOTSUPP;
 			if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
 				return -EACCES;
 
diff --git a/tools/include/uapi/asm-generic/mman-common.h b/tools/include/uapi/asm-generic/mman-common.h
index 203268f9231e..8ce7f5a0800f 100644
--- a/tools/include/uapi/asm-generic/mman-common.h
+++ b/tools/include/uapi/asm-generic/mman-common.h
@@ -16,6 +16,7 @@
 
 #define MAP_SHARED	0x01		/* Share changes */
 #define MAP_PRIVATE	0x02		/* Changes are private */
+#define MAP_SHARED_VALIDATE 0x03	/* share + validate extension flags */
 #define MAP_TYPE	0x0f		/* Mask for type of mapping */
 #define MAP_FIXED	0x10		/* Interpret addr exactly */
 #define MAP_ANONYMOUS	0x20		/* don't use a file */
-- 
cgit v1.2.3


From d81b8a722f691a3907eb4a6df410ab517ba5bfcc Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 1 Nov 2017 16:36:31 +0100
Subject: mm: Remove VM_FAULT_HWPOISON_LARGE_MASK

It is unused.

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/mm.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 065d99deb847..ca72b67153d5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1182,8 +1182,6 @@ static inline void clear_page_pfmemalloc(struct page *page)
 #define VM_FAULT_FALLBACK 0x0800	/* huge page fault failed, fall back to small */
 #define VM_FAULT_DONE_COW   0x1000	/* ->fault has fully handled COW */
 
-#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
-
 #define VM_FAULT_ERROR	(VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | \
 			 VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \
 			 VM_FAULT_FALLBACK)
-- 
cgit v1.2.3


From 31a6f1a6e5a4a26040b67d8fa4256539b36f5893 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 1 Nov 2017 16:36:32 +0100
Subject: dax: Simplify arguments of dax_insert_mapping()

dax_insert_mapping() has lots of arguments and a lot of them is actuall
duplicated by passing vm_fault structure as well. Change the function to
take the same arguments as dax_pmd_insert_mapping().

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/dax.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index f001d8c72a06..0bc42ac294ca 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -820,23 +820,30 @@ out:
 }
 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 
-static int dax_insert_mapping(struct address_space *mapping,
-		struct block_device *bdev, struct dax_device *dax_dev,
-		sector_t sector, size_t size, void *entry,
-		struct vm_area_struct *vma, struct vm_fault *vmf)
+static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
+{
+	return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9);
+}
+
+static int dax_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
+			      loff_t pos, void *entry)
 {
+	const sector_t sector = dax_iomap_sector(iomap, pos);
+	struct vm_area_struct *vma = vmf->vma;
+	struct address_space *mapping = vma->vm_file->f_mapping;
 	unsigned long vaddr = vmf->address;
 	void *ret, *kaddr;
 	pgoff_t pgoff;
 	int id, rc;
 	pfn_t pfn;
 
-	rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+	rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff);
 	if (rc)
 		return rc;
 
 	id = dax_read_lock();
-	rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
+	rc = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(PAGE_SIZE),
+			       &kaddr, &pfn);
 	if (rc < 0) {
 		dax_read_unlock(id);
 		return rc;
@@ -936,11 +943,6 @@ int __dax_zero_page_range(struct block_device *bdev,
 }
 EXPORT_SYMBOL_GPL(__dax_zero_page_range);
 
-static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
-{
-	return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9);
-}
-
 static loff_t
 dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		struct iomap *iomap)
@@ -1087,7 +1089,6 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 	struct inode *inode = mapping->host;
 	unsigned long vaddr = vmf->address;
 	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
-	sector_t sector;
 	struct iomap iomap = { 0 };
 	unsigned flags = IOMAP_FAULT;
 	int error, major = 0;
@@ -1140,9 +1141,9 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 		goto error_finish_iomap;
 	}
 
-	sector = dax_iomap_sector(&iomap, pos);
-
 	if (vmf->cow_page) {
+		sector_t sector = dax_iomap_sector(&iomap, pos);
+
 		switch (iomap.type) {
 		case IOMAP_HOLE:
 		case IOMAP_UNWRITTEN:
@@ -1175,8 +1176,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 			count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
 			major = VM_FAULT_MAJOR;
 		}
-		error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev,
-				sector, PAGE_SIZE, entry, vmf->vma, vmf);
+		error = dax_insert_mapping(vmf, &iomap, pos, entry);
 		/* -EBUSY is fine, somebody else faulted on the same PTE */
 		if (error == -EBUSY)
 			error = 0;
-- 
cgit v1.2.3


From 5e161e4066d3ebeaff95a4b979b42f8bf00494d5 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 1 Nov 2017 16:36:33 +0100
Subject: dax: Factor out getting of pfn out of iomap

Factor out code to get pfn out of iomap that is shared between PTE and
PMD fault path.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/dax.c | 83 +++++++++++++++++++++++++++++++++-------------------------------
 1 file changed, 43 insertions(+), 40 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 0bc42ac294ca..116eef8d6c69 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -825,30 +825,53 @@ static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
 	return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9);
 }
 
-static int dax_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
-			      loff_t pos, void *entry)
+static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size,
+			 pfn_t *pfnp)
 {
 	const sector_t sector = dax_iomap_sector(iomap, pos);
-	struct vm_area_struct *vma = vmf->vma;
-	struct address_space *mapping = vma->vm_file->f_mapping;
-	unsigned long vaddr = vmf->address;
-	void *ret, *kaddr;
 	pgoff_t pgoff;
+	void *kaddr;
 	int id, rc;
-	pfn_t pfn;
+	long length;
 
-	rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff);
+	rc = bdev_dax_pgoff(iomap->bdev, sector, size, &pgoff);
 	if (rc)
 		return rc;
-
 	id = dax_read_lock();
-	rc = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(PAGE_SIZE),
-			       &kaddr, &pfn);
-	if (rc < 0) {
-		dax_read_unlock(id);
-		return rc;
+	length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
+				   &kaddr, pfnp);
+	if (length < 0) {
+		rc = length;
+		goto out;
 	}
+	rc = -EINVAL;
+	if (PFN_PHYS(length) < size)
+		goto out;
+	if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
+		goto out;
+	/* For larger pages we need devmap */
+	if (length > 1 && !pfn_t_devmap(*pfnp))
+		goto out;
+	rc = 0;
+out:
 	dax_read_unlock(id);
+	return rc;
+}
+
+static int dax_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
+			      loff_t pos, void *entry)
+{
+	const sector_t sector = dax_iomap_sector(iomap, pos);
+	struct vm_area_struct *vma = vmf->vma;
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	unsigned long vaddr = vmf->address;
+	void *ret;
+	int rc;
+	pfn_t pfn;
+
+	rc = dax_iomap_pfn(iomap, pos, PAGE_SIZE, &pfn);
+	if (rc < 0)
+		return rc;
 
 	ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);
 	if (IS_ERR(ret))
@@ -1223,46 +1246,26 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
 {
 	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
 	const sector_t sector = dax_iomap_sector(iomap, pos);
-	struct dax_device *dax_dev = iomap->dax_dev;
-	struct block_device *bdev = iomap->bdev;
 	struct inode *inode = mapping->host;
-	const size_t size = PMD_SIZE;
-	void *ret = NULL, *kaddr;
-	long length = 0;
-	pgoff_t pgoff;
+	void *ret = NULL;
 	pfn_t pfn = {};
-	int id;
+	int rc;
 
-	if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0)
+	rc = dax_iomap_pfn(iomap, pos, PMD_SIZE, &pfn);
+	if (rc < 0)
 		goto fallback;
 
-	id = dax_read_lock();
-	length = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
-	if (length < 0)
-		goto unlock_fallback;
-	length = PFN_PHYS(length);
-
-	if (length < size)
-		goto unlock_fallback;
-	if (pfn_t_to_pfn(pfn) & PG_PMD_COLOUR)
-		goto unlock_fallback;
-	if (!pfn_t_devmap(pfn))
-		goto unlock_fallback;
-	dax_read_unlock(id);
-
 	ret = dax_insert_mapping_entry(mapping, vmf, entry, sector,
 			RADIX_DAX_PMD);
 	if (IS_ERR(ret))
 		goto fallback;
 
-	trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);
+	trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, ret);
 	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
 			pfn, vmf->flags & FAULT_FLAG_WRITE);
 
-unlock_fallback:
-	dax_read_unlock(id);
 fallback:
-	trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret);
+	trace_dax_pmd_insert_mapping_fallback(inode, vmf, PMD_SIZE, pfn, ret);
 	return VM_FAULT_FALLBACK;
 }
 
-- 
cgit v1.2.3


From a0987ad5c576626fdb40547f5ac99b4f90608dda Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 1 Nov 2017 16:36:34 +0100
Subject: dax: Create local variable for VMA in dax_iomap_pte_fault()

There are already two users and more are coming.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/dax.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 116eef8d6c69..c09465884bbe 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1108,7 +1108,8 @@ static int dax_fault_return(int error)
 static int dax_iomap_pte_fault(struct vm_fault *vmf,
 			       const struct iomap_ops *ops)
 {
-	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+	struct vm_area_struct *vma = vmf->vma;
+	struct address_space *mapping = vma->vm_file->f_mapping;
 	struct inode *inode = mapping->host;
 	unsigned long vaddr = vmf->address;
 	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
@@ -1196,7 +1197,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 	case IOMAP_MAPPED:
 		if (iomap.flags & IOMAP_F_NEW) {
 			count_vm_event(PGMAJFAULT);
-			count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
+			count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
 			major = VM_FAULT_MAJOR;
 		}
 		error = dax_insert_mapping(vmf, &iomap, pos, entry);
-- 
cgit v1.2.3


From d2c43ef13327478b299db0f58193495367c6d4ae Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 1 Nov 2017 16:36:35 +0100
Subject: dax: Create local variable for vmf->flags & FAULT_FLAG_WRITE test

There are already two users and more are coming.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/dax.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index c09465884bbe..5ea71381dba0 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1116,6 +1116,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 	struct iomap iomap = { 0 };
 	unsigned flags = IOMAP_FAULT;
 	int error, major = 0;
+	bool write = vmf->flags & FAULT_FLAG_WRITE;
 	int vmf_ret = 0;
 	void *entry;
 
@@ -1130,7 +1131,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 		goto out;
 	}
 
-	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
+	if (write && !vmf->cow_page)
 		flags |= IOMAP_WRITE;
 
 	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
@@ -1207,7 +1208,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 		break;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
-		if (!(vmf->flags & FAULT_FLAG_WRITE)) {
+		if (!write) {
 			vmf_ret = dax_load_hole(mapping, entry, vmf);
 			goto finish_iomap;
 		}
-- 
cgit v1.2.3


From 1b5a1cb21e0cdfb001050c76dc31039cdece1a63 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 1 Nov 2017 16:36:36 +0100
Subject: dax: Inline dax_insert_mapping() into the callsite

dax_insert_mapping() has only one callsite and we will need to further
fine tune what it does for synchronous faults. Just inline it into the
callsite so that we don't have to pass awkward bools around.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/dax.c | 46 +++++++++++++++++++---------------------------
 1 file changed, 19 insertions(+), 27 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 5ea71381dba0..5b20c6456926 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -858,32 +858,6 @@ out:
 	return rc;
 }
 
-static int dax_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
-			      loff_t pos, void *entry)
-{
-	const sector_t sector = dax_iomap_sector(iomap, pos);
-	struct vm_area_struct *vma = vmf->vma;
-	struct address_space *mapping = vma->vm_file->f_mapping;
-	unsigned long vaddr = vmf->address;
-	void *ret;
-	int rc;
-	pfn_t pfn;
-
-	rc = dax_iomap_pfn(iomap, pos, PAGE_SIZE, &pfn);
-	if (rc < 0)
-		return rc;
-
-	ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);
-	if (IS_ERR(ret))
-		return PTR_ERR(ret);
-
-	trace_dax_insert_mapping(mapping->host, vmf, ret);
-	if (vmf->flags & FAULT_FLAG_WRITE)
-		return vm_insert_mixed_mkwrite(vma, vaddr, pfn);
-	else
-		return vm_insert_mixed(vma, vaddr, pfn);
-}
-
 /*
  * The user has performed a load from a hole in the file.  Allocating a new
  * page in the file would cause excessive storage usage for workloads with
@@ -1119,6 +1093,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
 	int vmf_ret = 0;
 	void *entry;
+	pfn_t pfn;
 
 	trace_dax_pte_fault(inode, vmf, vmf_ret);
 	/*
@@ -1201,7 +1176,24 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 			count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
 			major = VM_FAULT_MAJOR;
 		}
-		error = dax_insert_mapping(vmf, &iomap, pos, entry);
+		error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn);
+		if (error < 0)
+			goto error_finish_iomap;
+
+		entry = dax_insert_mapping_entry(mapping, vmf, entry,
+						 dax_iomap_sector(&iomap, pos),
+						 0);
+		if (IS_ERR(entry)) {
+			error = PTR_ERR(entry);
+			goto error_finish_iomap;
+		}
+
+		trace_dax_insert_mapping(inode, vmf, entry);
+		if (write)
+			error = vm_insert_mixed_mkwrite(vma, vaddr, pfn);
+		else
+			error = vm_insert_mixed(vma, vaddr, pfn);
+
 		/* -EBUSY is fine, somebody else faulted on the same PTE */
 		if (error == -EBUSY)
 			error = 0;
-- 
cgit v1.2.3


From 302a5e312b3a106fec49ba08da3f6545c9b7ee18 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 1 Nov 2017 16:36:37 +0100
Subject: dax: Inline dax_pmd_insert_mapping() into the callsite

dax_pmd_insert_mapping() has only one callsite and we will need to
further fine tune what it does for synchronous faults. Just inline it
into the callsite so that we don't have to pass awkward bools around.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/dax.c                      | 47 +++++++++++++++++--------------------------
 include/trace/events/fs_dax.h |  1 -
 2 files changed, 19 insertions(+), 29 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 5b20c6456926..675fab8ec41f 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1235,33 +1235,11 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 }
 
 #ifdef CONFIG_FS_DAX_PMD
-static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
-		loff_t pos, void *entry)
-{
-	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
-	const sector_t sector = dax_iomap_sector(iomap, pos);
-	struct inode *inode = mapping->host;
-	void *ret = NULL;
-	pfn_t pfn = {};
-	int rc;
-
-	rc = dax_iomap_pfn(iomap, pos, PMD_SIZE, &pfn);
-	if (rc < 0)
-		goto fallback;
-
-	ret = dax_insert_mapping_entry(mapping, vmf, entry, sector,
-			RADIX_DAX_PMD);
-	if (IS_ERR(ret))
-		goto fallback;
-
-	trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, ret);
-	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
-			pfn, vmf->flags & FAULT_FLAG_WRITE);
-
-fallback:
-	trace_dax_pmd_insert_mapping_fallback(inode, vmf, PMD_SIZE, pfn, ret);
-	return VM_FAULT_FALLBACK;
-}
+/*
+ * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
+ * more often than one might expect in the below functions.
+ */
+#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
 
 static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
 		void *entry)
@@ -1317,6 +1295,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 	void *entry;
 	loff_t pos;
 	int error;
+	pfn_t pfn;
 
 	/*
 	 * Check whether offset isn't beyond end of file now. Caller is
@@ -1394,7 +1373,19 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 
 	switch (iomap.type) {
 	case IOMAP_MAPPED:
-		result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry);
+		error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn);
+		if (error < 0)
+			goto finish_iomap;
+
+		entry = dax_insert_mapping_entry(mapping, vmf, entry,
+						dax_iomap_sector(&iomap, pos),
+						RADIX_DAX_PMD);
+		if (IS_ERR(entry))
+			goto finish_iomap;
+
+		trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
+		result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
+					    write);
 		break;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h
index fbc4a06f7310..88a9d19b8ff8 100644
--- a/include/trace/events/fs_dax.h
+++ b/include/trace/events/fs_dax.h
@@ -148,7 +148,6 @@ DEFINE_EVENT(dax_pmd_insert_mapping_class, name, \
 	TP_ARGS(inode, vmf, length, pfn, radix_entry))
 
 DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping);
-DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping_fallback);
 
 DECLARE_EVENT_CLASS(dax_pte_fault_class,
 	TP_PROTO(struct inode *inode, struct vm_fault *vmf, int result),
-- 
cgit v1.2.3


From cec04e8c825eaa5a4bb1a3ce0d3784628e965196 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 1 Nov 2017 16:36:38 +0100
Subject: dax: Fix comment describing dax_iomap_fault()

Add missing argument description.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/dax.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/dax.c b/fs/dax.c
index 675fab8ec41f..5214ed9ba508 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1435,7 +1435,8 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 /**
  * dax_iomap_fault - handle a page fault on a DAX file
  * @vmf: The description of the fault
- * @ops: iomap ops passed from the file system
+ * @pe_size: Size of the page to fault in
+ * @ops: Iomap ops passed from the file system
  *
  * When a page fault occurs, filesystems may call this helper in
  * their fault handler for DAX files. dax_iomap_fault() assumes the caller
-- 
cgit v1.2.3


From 9a0dd42251439de635088b97533109a935864a84 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 1 Nov 2017 16:36:39 +0100
Subject: dax: Allow dax_iomap_fault() to return pfn

For synchronous page fault dax_iomap_fault() will need to return PFN
which will then need to be inserted into page tables after fsync()
completes. Add necessary parameter to dax_iomap_fault().

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/dax.c            | 13 +++++++------
 fs/ext2/file.c      |  2 +-
 fs/ext4/file.c      |  2 +-
 fs/xfs/xfs_file.c   |  4 ++--
 include/linux/dax.h |  2 +-
 5 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 5214ed9ba508..5ddf15161390 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1079,7 +1079,7 @@ static int dax_fault_return(int error)
 	return VM_FAULT_SIGBUS;
 }
 
-static int dax_iomap_pte_fault(struct vm_fault *vmf,
+static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 			       const struct iomap_ops *ops)
 {
 	struct vm_area_struct *vma = vmf->vma;
@@ -1280,7 +1280,7 @@ fallback:
 	return VM_FAULT_FALLBACK;
 }
 
-static int dax_iomap_pmd_fault(struct vm_fault *vmf,
+static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 			       const struct iomap_ops *ops)
 {
 	struct vm_area_struct *vma = vmf->vma;
@@ -1425,7 +1425,7 @@ out:
 	return result;
 }
 #else
-static int dax_iomap_pmd_fault(struct vm_fault *vmf,
+static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 			       const struct iomap_ops *ops)
 {
 	return VM_FAULT_FALLBACK;
@@ -1436,6 +1436,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
  * dax_iomap_fault - handle a page fault on a DAX file
  * @vmf: The description of the fault
  * @pe_size: Size of the page to fault in
+ * @pfnp: PFN to insert for synchronous faults if fsync is required
  * @ops: Iomap ops passed from the file system
  *
  * When a page fault occurs, filesystems may call this helper in
@@ -1444,13 +1445,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
  * successfully.
  */
 int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
-		    const struct iomap_ops *ops)
+		    pfn_t *pfnp, const struct iomap_ops *ops)
 {
 	switch (pe_size) {
 	case PE_SIZE_PTE:
-		return dax_iomap_pte_fault(vmf, ops);
+		return dax_iomap_pte_fault(vmf, pfnp, ops);
 	case PE_SIZE_PMD:
-		return dax_iomap_pmd_fault(vmf, ops);
+		return dax_iomap_pmd_fault(vmf, pfnp, ops);
 	default:
 		return VM_FAULT_FALLBACK;
 	}
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index ff3a3636a5ca..d2bb7c96307d 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -99,7 +99,7 @@ static int ext2_dax_fault(struct vm_fault *vmf)
 	}
 	down_read(&ei->dax_sem);
 
-	ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &ext2_iomap_ops);
+	ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, &ext2_iomap_ops);
 
 	up_read(&ei->dax_sem);
 	if (vmf->flags & FAULT_FLAG_WRITE)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index b1da660ac3bc..3cec0b95672f 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -306,7 +306,7 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf,
 		down_read(&EXT4_I(inode)->i_mmap_sem);
 	}
 	if (!IS_ERR(handle))
-		result = dax_iomap_fault(vmf, pe_size, &ext4_iomap_ops);
+		result = dax_iomap_fault(vmf, pe_size, NULL, &ext4_iomap_ops);
 	else
 		result = VM_FAULT_SIGBUS;
 	if (write) {
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 309e26c9dddb..7c6b8def6eed 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1040,7 +1040,7 @@ __xfs_filemap_fault(
 
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 	if (IS_DAX(inode)) {
-		ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops);
+		ret = dax_iomap_fault(vmf, pe_size, NULL, &xfs_iomap_ops);
 	} else {
 		if (write_fault)
 			ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
@@ -1111,7 +1111,7 @@ xfs_filemap_pfn_mkwrite(
 	if (vmf->pgoff >= size)
 		ret = VM_FAULT_SIGBUS;
 	else if (IS_DAX(inode))
-		ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
+		ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, &xfs_iomap_ops);
 	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
 	sb_end_pagefault(inode->i_sb);
 	return ret;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 122197124b9d..e7fa4b8f45bc 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -95,7 +95,7 @@ bool dax_write_cache_enabled(struct dax_device *dax_dev);
 ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops);
 int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
-		    const struct iomap_ops *ops);
+		    pfn_t *pfnp, const struct iomap_ops *ops);
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
 int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
 				      pgoff_t index);
-- 
cgit v1.2.3


From f5b7b74876cff5664ea8b2ef7f002c54cd6e7c90 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 1 Nov 2017 16:36:40 +0100
Subject: dax: Allow tuning whether dax_insert_mapping_entry() dirties entry

Currently we dirty radix tree entry whenever dax_insert_mapping_entry()
gets called for a write fault. With synchronous page faults we would
like to insert clean radix tree entry and dirty it only once we call
fdatasync() and update page tables to save some unnecessary cache
flushing. Add 'dirty' argument to dax_insert_mapping_entry() for that.

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/dax.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 5ddf15161390..efc210ff6665 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -526,13 +526,13 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
 static void *dax_insert_mapping_entry(struct address_space *mapping,
 				      struct vm_fault *vmf,
 				      void *entry, sector_t sector,
-				      unsigned long flags)
+				      unsigned long flags, bool dirty)
 {
 	struct radix_tree_root *page_tree = &mapping->page_tree;
 	void *new_entry;
 	pgoff_t index = vmf->pgoff;
 
-	if (vmf->flags & FAULT_FLAG_WRITE)
+	if (dirty)
 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 
 	if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
@@ -569,7 +569,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
 		entry = new_entry;
 	}
 
-	if (vmf->flags & FAULT_FLAG_WRITE)
+	if (dirty)
 		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
 
 	spin_unlock_irq(&mapping->tree_lock);
@@ -881,7 +881,7 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
 	}
 
 	entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0,
-			RADIX_DAX_ZERO_PAGE);
+			RADIX_DAX_ZERO_PAGE, false);
 	if (IS_ERR(entry2)) {
 		ret = VM_FAULT_SIGBUS;
 		goto out;
@@ -1182,7 +1182,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 
 		entry = dax_insert_mapping_entry(mapping, vmf, entry,
 						 dax_iomap_sector(&iomap, pos),
-						 0);
+						 0, write);
 		if (IS_ERR(entry)) {
 			error = PTR_ERR(entry);
 			goto error_finish_iomap;
@@ -1258,7 +1258,7 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
 		goto fallback;
 
 	ret = dax_insert_mapping_entry(mapping, vmf, entry, 0,
-			RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE);
+			RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false);
 	if (IS_ERR(ret))
 		goto fallback;
 
@@ -1379,7 +1379,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 
 		entry = dax_insert_mapping_entry(mapping, vmf, entry,
 						dax_iomap_sector(&iomap, pos),
-						RADIX_DAX_PMD);
+						RADIX_DAX_PMD, write);
 		if (IS_ERR(entry))
 			goto finish_iomap;
 
-- 
cgit v1.2.3


From b6fb293f2497a9841d94f6b57bd2bb2cd222da43 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 1 Nov 2017 16:36:41 +0100
Subject: mm: Define MAP_SYNC and VM_SYNC flags

Define new MAP_SYNC flag and corresponding VMA VM_SYNC flag. As the
MAP_SYNC flag is not part of LEGACY_MAP_MASK, currently it will be
refused by all MAP_SHARED_VALIDATE map attempts and silently ignored for
everything else.

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/proc/task_mmu.c              | 1 +
 include/linux/mm.h              | 1 +
 include/linux/mman.h            | 8 ++++++--
 include/uapi/asm-generic/mman.h | 1 +
 4 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 5589b4bd4b85..ea78b37deeaa 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -664,6 +664,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
 		[ilog2(VM_ACCOUNT)]	= "ac",
 		[ilog2(VM_NORESERVE)]	= "nr",
 		[ilog2(VM_HUGETLB)]	= "ht",
+		[ilog2(VM_SYNC)]	= "sf",
 		[ilog2(VM_ARCH_1)]	= "ar",
 		[ilog2(VM_WIPEONFORK)]	= "wf",
 		[ilog2(VM_DONTDUMP)]	= "dd",
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ca72b67153d5..5411cb7442de 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -189,6 +189,7 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_ACCOUNT	0x00100000	/* Is a VM accounted object */
 #define VM_NORESERVE	0x00200000	/* should the VM suppress accounting */
 #define VM_HUGETLB	0x00400000	/* Huge TLB Page VM */
+#define VM_SYNC		0x00800000	/* Synchronous page faults */
 #define VM_ARCH_1	0x01000000	/* Architecture-specific flag */
 #define VM_WIPEONFORK	0x02000000	/* Wipe VMA contents in child. */
 #define VM_DONTDUMP	0x04000000	/* Do not include in the core dump */
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 74452e3f2536..3427bf3daef5 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -9,7 +9,7 @@
 
 /*
  * Arrange for legacy / undefined architecture specific flags to be
- * ignored by default in LEGACY_MAP_MASK.
+ * ignored by mmap handling code.
  */
 #ifndef MAP_32BIT
 #define MAP_32BIT 0
@@ -23,6 +23,9 @@
 #ifndef MAP_UNINITIALIZED
 #define MAP_UNINITIALIZED 0
 #endif
+#ifndef MAP_SYNC
+#define MAP_SYNC 0
+#endif
 
 /*
  * The historical set of flags that all mmap implementations implicitly
@@ -126,7 +129,8 @@ calc_vm_flag_bits(unsigned long flags)
 {
 	return _calc_vm_trans(flags, MAP_GROWSDOWN,  VM_GROWSDOWN ) |
 	       _calc_vm_trans(flags, MAP_DENYWRITE,  VM_DENYWRITE ) |
-	       _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    );
+	       _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    ) |
+	       _calc_vm_trans(flags, MAP_SYNC,	     VM_SYNC      );
 }
 
 unsigned long vm_commit_limit(void);
diff --git a/include/uapi/asm-generic/mman.h b/include/uapi/asm-generic/mman.h
index 7162cd4cca73..00e55627d2df 100644
--- a/include/uapi/asm-generic/mman.h
+++ b/include/uapi/asm-generic/mman.h
@@ -12,6 +12,7 @@
 #define MAP_NONBLOCK	0x10000		/* do not block on IO */
 #define MAP_STACK	0x20000		/* give out an address that is best suited for process/thread stacks */
 #define MAP_HUGETLB	0x40000		/* create a huge page mapping */
+#define MAP_SYNC	0x80000		/* perform synchronous page faults for the mapping */
 
 /* Bits [26:31] are reserved, see mman-common.h for MAP_HUGETLB usage */
 
-- 
cgit v1.2.3


From caa51d26f85c248f1c4f43a870ad3ef84bf9eb8f Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 1 Nov 2017 16:36:42 +0100
Subject: dax, iomap: Add support for synchronous faults

Add a flag to iomap interface informing the caller that inode needs
fdstasync(2) for returned extent to become persistent and use it in DAX
fault code so that we don't map such extents into page tables
immediately. Instead we propagate the information that fdatasync(2) is
necessary from dax_iomap_fault() with a new VM_FAULT_NEEDDSYNC flag.
Filesystem fault handler is then responsible for calling fdatasync(2)
and inserting pfn into page tables.

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/dax.c              | 39 +++++++++++++++++++++++++++++++++++++--
 include/linux/iomap.h |  5 +++++
 include/linux/mm.h    |  6 +++++-
 3 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index efc210ff6665..bb9ff907738c 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1091,6 +1091,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	unsigned flags = IOMAP_FAULT;
 	int error, major = 0;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
+	bool sync;
 	int vmf_ret = 0;
 	void *entry;
 	pfn_t pfn;
@@ -1169,6 +1170,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 		goto finish_iomap;
 	}
 
+	sync = (vma->vm_flags & VM_SYNC) && (iomap.flags & IOMAP_F_DIRTY);
+
 	switch (iomap.type) {
 	case IOMAP_MAPPED:
 		if (iomap.flags & IOMAP_F_NEW) {
@@ -1182,12 +1185,27 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 
 		entry = dax_insert_mapping_entry(mapping, vmf, entry,
 						 dax_iomap_sector(&iomap, pos),
-						 0, write);
+						 0, write && !sync);
 		if (IS_ERR(entry)) {
 			error = PTR_ERR(entry);
 			goto error_finish_iomap;
 		}
 
+		/*
+		 * If we are doing synchronous page fault and inode needs fsync,
+		 * we can insert PTE into page tables only after that happens.
+		 * Skip insertion for now and return the pfn so that caller can
+		 * insert it after fsync is done.
+		 */
+		if (sync) {
+			if (WARN_ON_ONCE(!pfnp)) {
+				error = -EIO;
+				goto error_finish_iomap;
+			}
+			*pfnp = pfn;
+			vmf_ret = VM_FAULT_NEEDDSYNC | major;
+			goto finish_iomap;
+		}
 		trace_dax_insert_mapping(inode, vmf, entry);
 		if (write)
 			error = vm_insert_mixed_mkwrite(vma, vaddr, pfn);
@@ -1287,6 +1305,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	struct address_space *mapping = vma->vm_file->f_mapping;
 	unsigned long pmd_addr = vmf->address & PMD_MASK;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
+	bool sync;
 	unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
 	struct inode *inode = mapping->host;
 	int result = VM_FAULT_FALLBACK;
@@ -1371,6 +1390,8 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	if (iomap.offset + iomap.length < pos + PMD_SIZE)
 		goto finish_iomap;
 
+	sync = (vma->vm_flags & VM_SYNC) && (iomap.flags & IOMAP_F_DIRTY);
+
 	switch (iomap.type) {
 	case IOMAP_MAPPED:
 		error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn);
@@ -1379,10 +1400,24 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 
 		entry = dax_insert_mapping_entry(mapping, vmf, entry,
 						dax_iomap_sector(&iomap, pos),
-						RADIX_DAX_PMD, write);
+						RADIX_DAX_PMD, write && !sync);
 		if (IS_ERR(entry))
 			goto finish_iomap;
 
+		/*
+		 * If we are doing synchronous page fault and inode needs fsync,
+		 * we can insert PMD into page tables only after that happens.
+		 * Skip insertion for now and return the pfn so that caller can
+		 * insert it after fsync is done.
+		 */
+		if (sync) {
+			if (WARN_ON_ONCE(!pfnp))
+				goto finish_iomap;
+			*pfnp = pfn;
+			result = VM_FAULT_NEEDDSYNC;
+			goto finish_iomap;
+		}
+
 		trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
 		result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
 					    write);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index f64dc6ce5161..73e3b7085dbe 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -22,6 +22,11 @@ struct vm_fault;
  * Flags for all iomap mappings:
  */
 #define IOMAP_F_NEW	0x01	/* blocks have been newly allocated */
+/*
+ * IOMAP_F_DIRTY indicates the inode has uncommitted metadata needed to access
+ * written data and requires fdatasync to commit them to persistent storage.
+ */
+#define IOMAP_F_DIRTY	0x02
 
 /*
  * Flags that only need to be reported for IOMAP_REPORT requests:
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5411cb7442de..f57e55782d7d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1182,6 +1182,9 @@ static inline void clear_page_pfmemalloc(struct page *page)
 #define VM_FAULT_RETRY	0x0400	/* ->fault blocked, must retry */
 #define VM_FAULT_FALLBACK 0x0800	/* huge page fault failed, fall back to small */
 #define VM_FAULT_DONE_COW   0x1000	/* ->fault has fully handled COW */
+#define VM_FAULT_NEEDDSYNC  0x2000	/* ->fault did not modify page tables
+					 * and needs fsync() to complete (for
+					 * synchronous page faults in DAX) */
 
 #define VM_FAULT_ERROR	(VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | \
 			 VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \
@@ -1199,7 +1202,8 @@ static inline void clear_page_pfmemalloc(struct page *page)
 	{ VM_FAULT_LOCKED,		"LOCKED" }, \
 	{ VM_FAULT_RETRY,		"RETRY" }, \
 	{ VM_FAULT_FALLBACK,		"FALLBACK" }, \
-	{ VM_FAULT_DONE_COW,		"DONE_COW" }
+	{ VM_FAULT_DONE_COW,		"DONE_COW" }, \
+	{ VM_FAULT_NEEDDSYNC,		"NEEDDSYNC" }
 
 /* Encode hstate index for a hwpoisoned large page */
 #define VM_FAULT_SET_HINDEX(x) ((x) << 12)
-- 
cgit v1.2.3


From 71eab6dfd91eabf06fe782a590c07e8a0795f5ea Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 1 Nov 2017 16:36:43 +0100
Subject: dax: Implement dax_finish_sync_fault()

Implement a function that filesystems can call to finish handling of
synchronous page faults. It takes care of syncing appropriare file range
and insertion of page table entry.

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/dax.c                      | 83 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/dax.h           |  2 ++
 include/trace/events/fs_dax.h |  2 ++
 3 files changed, 87 insertions(+)

diff --git a/fs/dax.c b/fs/dax.c
index bb9ff907738c..78233c716757 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1492,3 +1492,86 @@ int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
 	}
 }
 EXPORT_SYMBOL_GPL(dax_iomap_fault);
+
+/**
+ * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
+ * @vmf: The description of the fault
+ * @pe_size: Size of entry to be inserted
+ * @pfn: PFN to insert
+ *
+ * This function inserts writeable PTE or PMD entry into page tables for mmaped
+ * DAX file.  It takes care of marking corresponding radix tree entry as dirty
+ * as well.
+ */
+static int dax_insert_pfn_mkwrite(struct vm_fault *vmf,
+				  enum page_entry_size pe_size,
+				  pfn_t pfn)
+{
+	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+	void *entry, **slot;
+	pgoff_t index = vmf->pgoff;
+	int vmf_ret, error;
+
+	spin_lock_irq(&mapping->tree_lock);
+	entry = get_unlocked_mapping_entry(mapping, index, &slot);
+	/* Did we race with someone splitting entry or so? */
+	if (!entry ||
+	    (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) ||
+	    (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) {
+		put_unlocked_mapping_entry(mapping, index, entry);
+		spin_unlock_irq(&mapping->tree_lock);
+		trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
+						      VM_FAULT_NOPAGE);
+		return VM_FAULT_NOPAGE;
+	}
+	radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
+	entry = lock_slot(mapping, slot);
+	spin_unlock_irq(&mapping->tree_lock);
+	switch (pe_size) {
+	case PE_SIZE_PTE:
+		error = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
+		vmf_ret = dax_fault_return(error);
+		break;
+#ifdef CONFIG_FS_DAX_PMD
+	case PE_SIZE_PMD:
+		vmf_ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
+			pfn, true);
+		break;
+#endif
+	default:
+		vmf_ret = VM_FAULT_FALLBACK;
+	}
+	put_locked_mapping_entry(mapping, index);
+	trace_dax_insert_pfn_mkwrite(mapping->host, vmf, vmf_ret);
+	return vmf_ret;
+}
+
+/**
+ * dax_finish_sync_fault - finish synchronous page fault
+ * @vmf: The description of the fault
+ * @pe_size: Size of entry to be inserted
+ * @pfn: PFN to insert
+ *
+ * This function ensures that the file range touched by the page fault is
+ * stored persistently on the media and handles inserting of appropriate page
+ * table entry.
+ */
+int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
+			  pfn_t pfn)
+{
+	int err;
+	loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
+	size_t len = 0;
+
+	if (pe_size == PE_SIZE_PTE)
+		len = PAGE_SIZE;
+	else if (pe_size == PE_SIZE_PMD)
+		len = PMD_SIZE;
+	else
+		WARN_ON_ONCE(1);
+	err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
+	if (err)
+		return VM_FAULT_SIGBUS;
+	return dax_insert_pfn_mkwrite(vmf, pe_size, pfn);
+}
+EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
diff --git a/include/linux/dax.h b/include/linux/dax.h
index e7fa4b8f45bc..d403f78b706c 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -96,6 +96,8 @@ ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops);
 int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
 		    pfn_t *pfnp, const struct iomap_ops *ops);
+int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
+			  pfn_t pfn);
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
 int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
 				      pgoff_t index);
diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h
index 88a9d19b8ff8..7725459fafef 100644
--- a/include/trace/events/fs_dax.h
+++ b/include/trace/events/fs_dax.h
@@ -190,6 +190,8 @@ DEFINE_EVENT(dax_pte_fault_class, name, \
 DEFINE_PTE_FAULT_EVENT(dax_pte_fault);
 DEFINE_PTE_FAULT_EVENT(dax_pte_fault_done);
 DEFINE_PTE_FAULT_EVENT(dax_load_hole);
+DEFINE_PTE_FAULT_EVENT(dax_insert_pfn_mkwrite_no_entry);
+DEFINE_PTE_FAULT_EVENT(dax_insert_pfn_mkwrite);
 
 TRACE_EVENT(dax_insert_mapping,
 	TP_PROTO(struct inode *inode, struct vm_fault *vmf, void *radix_entry),
-- 
cgit v1.2.3


From 497f6926d880c57f65bf7c3f1086526fa774c55e Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 1 Nov 2017 16:36:44 +0100
Subject: ext4: Simplify error handling in ext4_dax_huge_fault()

If transaction starting fails, just bail out of the function immediately
instead of checking for that condition throughout the function.

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/ext4/file.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 3cec0b95672f..208adfc3e673 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -302,16 +302,17 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf,
 		down_read(&EXT4_I(inode)->i_mmap_sem);
 		handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
 					       EXT4_DATA_TRANS_BLOCKS(sb));
+		if (IS_ERR(handle)) {
+			up_read(&EXT4_I(inode)->i_mmap_sem);
+			sb_end_pagefault(sb);
+			return VM_FAULT_SIGBUS;
+		}
 	} else {
 		down_read(&EXT4_I(inode)->i_mmap_sem);
 	}
-	if (!IS_ERR(handle))
-		result = dax_iomap_fault(vmf, pe_size, NULL, &ext4_iomap_ops);
-	else
-		result = VM_FAULT_SIGBUS;
+	result = dax_iomap_fault(vmf, pe_size, NULL, &ext4_iomap_ops);
 	if (write) {
-		if (!IS_ERR(handle))
-			ext4_journal_stop(handle);
+		ext4_journal_stop(handle);
 		up_read(&EXT4_I(inode)->i_mmap_sem);
 		sb_end_pagefault(sb);
 	} else {
-- 
cgit v1.2.3


From b8a6176c214cf9aa2679131ed7e4515cddaadc33 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 1 Nov 2017 16:36:45 +0100
Subject: ext4: Support for synchronous DAX faults

We return IOMAP_F_DIRTY flag from ext4_iomap_begin() when asked to
prepare blocks for writing and the inode has some uncommitted metadata
changes. In the fault handler ext4_dax_fault() we then detect this case
(through VM_FAULT_NEEDDSYNC return value) and call helper
dax_finish_sync_fault() to flush metadata changes and insert page table
entry. Note that this will also dirty corresponding radix tree entry
which is what we want - fsync(2) will still provide data integrity
guarantees for applications not using userspace flushing. And
applications using userspace flushing can avoid calling fsync(2) and
thus avoid the performance overhead.

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/ext4/file.c       | 15 ++++++++++++++-
 fs/ext4/inode.c      | 15 +++++++++++++++
 fs/jbd2/journal.c    | 17 +++++++++++++++++
 include/linux/jbd2.h |  1 +
 4 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 208adfc3e673..08a1d1a33a90 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -26,6 +26,7 @@
 #include <linux/quotaops.h>
 #include <linux/pagevec.h>
 #include <linux/uio.h>
+#include <linux/mman.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -295,6 +296,7 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf,
 	 */
 	bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
 		(vmf->vma->vm_flags & VM_SHARED);
+	pfn_t pfn;
 
 	if (write) {
 		sb_start_pagefault(sb);
@@ -310,9 +312,12 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf,
 	} else {
 		down_read(&EXT4_I(inode)->i_mmap_sem);
 	}
-	result = dax_iomap_fault(vmf, pe_size, NULL, &ext4_iomap_ops);
+	result = dax_iomap_fault(vmf, pe_size, &pfn, &ext4_iomap_ops);
 	if (write) {
 		ext4_journal_stop(handle);
+		/* Handling synchronous page fault? */
+		if (result & VM_FAULT_NEEDDSYNC)
+			result = dax_finish_sync_fault(vmf, pe_size, pfn);
 		up_read(&EXT4_I(inode)->i_mmap_sem);
 		sb_end_pagefault(sb);
 	} else {
@@ -350,6 +355,13 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
 	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
 		return -EIO;
 
+	/*
+	 * We don't support synchronous mappings for non-DAX files. At least
+	 * until someone comes with a sensible use case.
+	 */
+	if (!IS_DAX(file_inode(file)) && (vma->vm_flags & VM_SYNC))
+		return -EOPNOTSUPP;
+
 	file_accessed(file);
 	if (IS_DAX(file_inode(file))) {
 		vma->vm_ops = &ext4_dax_vm_ops;
@@ -719,6 +731,7 @@ const struct file_operations ext4_file_operations = {
 	.compat_ioctl	= ext4_compat_ioctl,
 #endif
 	.mmap		= ext4_file_mmap,
+	.mmap_supported_flags = MAP_SYNC,
 	.open		= ext4_file_open,
 	.release	= ext4_release_file,
 	.fsync		= ext4_sync_file,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 31db875bc7a1..13a198924a0f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3394,6 +3394,19 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
 }
 
 #ifdef CONFIG_FS_DAX
+static bool ext4_inode_datasync_dirty(struct inode *inode)
+{
+	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+
+	if (journal)
+		return !jbd2_transaction_committed(journal,
+					EXT4_I(inode)->i_datasync_tid);
+	/* Any metadata buffers to write? */
+	if (!list_empty(&inode->i_mapping->private_list))
+		return true;
+	return inode->i_state & I_DIRTY_DATASYNC;
+}
+
 static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 			    unsigned flags, struct iomap *iomap)
 {
@@ -3466,6 +3479,8 @@ retry:
 	}
 
 	iomap->flags = 0;
+	if ((flags & IOMAP_WRITE) && ext4_inode_datasync_dirty(inode))
+		iomap->flags |= IOMAP_F_DIRTY;
 	iomap->bdev = inode->i_sb->s_bdev;
 	iomap->dax_dev = sbi->s_daxdev;
 	iomap->offset = first_block << blkbits;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 7d5ef3bf3f3e..fa8cde498b4b 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -738,6 +738,23 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
 	return err;
 }
 
+/* Return 1 when transaction with given tid has already committed. */
+int jbd2_transaction_committed(journal_t *journal, tid_t tid)
+{
+	int ret = 1;
+
+	read_lock(&journal->j_state_lock);
+	if (journal->j_running_transaction &&
+	    journal->j_running_transaction->t_tid == tid)
+		ret = 0;
+	if (journal->j_committing_transaction &&
+	    journal->j_committing_transaction->t_tid == tid)
+		ret = 0;
+	read_unlock(&journal->j_state_lock);
+	return ret;
+}
+EXPORT_SYMBOL(jbd2_transaction_committed);
+
 /*
  * When this function returns the transaction corresponding to tid
  * will be completed.  If the transaction has currently running, start
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 606b6bce3a5b..296d1e0ea87b 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1367,6 +1367,7 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid);
 int __jbd2_log_start_commit(journal_t *journal, tid_t tid);
 int jbd2_journal_start_commit(journal_t *journal, tid_t *tid);
 int jbd2_log_wait_commit(journal_t *journal, tid_t tid);
+int jbd2_transaction_committed(journal_t *journal, tid_t tid);
 int jbd2_complete_transaction(journal_t *journal, tid_t tid);
 int jbd2_log_do_checkpoint(journal_t *journal);
 int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid);
-- 
cgit v1.2.3


From 7b565c9f965bac269cc6a1f25c819f77044abf9d Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 1 Nov 2017 16:36:46 +0100
Subject: xfs: Implement xfs_filemap_pfn_mkwrite() using __xfs_filemap_fault()

xfs_filemap_pfn_mkwrite() duplicates a lot of __xfs_filemap_fault().
It will also need to handle flushing for synchronous page faults. So
just make that function use __xfs_filemap_fault().

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/xfs/xfs_file.c  | 29 ++++-------------------------
 fs/xfs/xfs_trace.h |  2 --
 2 files changed, 4 insertions(+), 27 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 7c6b8def6eed..4496b45678de 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1085,37 +1085,16 @@ xfs_filemap_page_mkwrite(
 }
 
 /*
- * pfn_mkwrite was originally inteneded to ensure we capture time stamp
- * updates on write faults. In reality, it's need to serialise against
- * truncate similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED
- * to ensure we serialise the fault barrier in place.
+ * pfn_mkwrite was originally intended to ensure we capture time stamp updates
+ * on write faults. In reality, it needs to serialise against truncate and
+ * prepare memory for writing so handle is as standard write fault.
  */
 static int
 xfs_filemap_pfn_mkwrite(
 	struct vm_fault		*vmf)
 {
 
-	struct inode		*inode = file_inode(vmf->vma->vm_file);
-	struct xfs_inode	*ip = XFS_I(inode);
-	int			ret = VM_FAULT_NOPAGE;
-	loff_t			size;
-
-	trace_xfs_filemap_pfn_mkwrite(ip);
-
-	sb_start_pagefault(inode->i_sb);
-	file_update_time(vmf->vma->vm_file);
-
-	/* check if the faulting page hasn't raced with truncate */
-	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
-	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	if (vmf->pgoff >= size)
-		ret = VM_FAULT_SIGBUS;
-	else if (IS_DAX(inode))
-		ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, &xfs_iomap_ops);
-	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
-	sb_end_pagefault(inode->i_sb);
-	return ret;
-
+	return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
 }
 
 static const struct vm_operations_struct xfs_file_vm_ops = {
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index bb5514688d47..6333ad09e0f3 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -688,8 +688,6 @@ DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag);
 DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag);
 DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid);
 
-DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite);
-
 TRACE_EVENT(xfs_filemap_fault,
 	TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size,
 		 bool write_fault),
-- 
cgit v1.2.3


From a39e596baa07cb1dc19c2ead14c9fd2a30f22352 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 1 Nov 2017 16:36:47 +0100
Subject: xfs: support for synchronous DAX faults

Return IOMAP_F_DIRTY from xfs_file_iomap_begin() when asked to prepare
blocks for writing and the inode is pinned, and has dirty fields other
than the timestamps.  In __xfs_filemap_fault() we then detect this case
and call dax_finish_sync_fault() to make sure all metadata is committed,
and to insert the page table entry.

Note that this will also dirty corresponding radix tree entry which is
what we want - fsync(2) will still provide data integrity guarantees for
applications not using userspace flushing. And applications using
userspace flushing can avoid calling fsync(2) and thus avoid the
performance overhead.

[JK: Added VM_SYNC flag handling]

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/xfs/xfs_file.c  | 15 ++++++++++++++-
 fs/xfs/xfs_iomap.c |  5 +++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 4496b45678de..4827e82d5d2c 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -44,6 +44,7 @@
 #include <linux/falloc.h>
 #include <linux/pagevec.h>
 #include <linux/backing-dev.h>
+#include <linux/mman.h>
 
 static const struct vm_operations_struct xfs_file_vm_ops;
 
@@ -1040,7 +1041,11 @@ __xfs_filemap_fault(
 
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 	if (IS_DAX(inode)) {
-		ret = dax_iomap_fault(vmf, pe_size, NULL, &xfs_iomap_ops);
+		pfn_t pfn;
+
+		ret = dax_iomap_fault(vmf, pe_size, &pfn, &xfs_iomap_ops);
+		if (ret & VM_FAULT_NEEDDSYNC)
+			ret = dax_finish_sync_fault(vmf, pe_size, pfn);
 	} else {
 		if (write_fault)
 			ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
@@ -1110,6 +1115,13 @@ xfs_file_mmap(
 	struct file	*filp,
 	struct vm_area_struct *vma)
 {
+	/*
+	 * We don't support synchronous mappings for non-DAX files. At least
+	 * until someone comes with a sensible use case.
+	 */
+	if (!IS_DAX(file_inode(filp)) && (vma->vm_flags & VM_SYNC))
+		return -EOPNOTSUPP;
+
 	file_accessed(filp);
 	vma->vm_ops = &xfs_file_vm_ops;
 	if (IS_DAX(file_inode(filp)))
@@ -1128,6 +1140,7 @@ const struct file_operations xfs_file_operations = {
 	.compat_ioctl	= xfs_file_compat_ioctl,
 #endif
 	.mmap		= xfs_file_mmap,
+	.mmap_supported_flags = MAP_SYNC,
 	.open		= xfs_file_open,
 	.release	= xfs_file_release,
 	.fsync		= xfs_file_fsync,
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index f179bdf1644d..b43be199fbdf 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -33,6 +33,7 @@
 #include "xfs_error.h"
 #include "xfs_trans.h"
 #include "xfs_trans_space.h"
+#include "xfs_inode_item.h"
 #include "xfs_iomap.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
@@ -1086,6 +1087,10 @@ xfs_file_iomap_begin(
 		trace_xfs_iomap_found(ip, offset, length, 0, &imap);
 	}
 
+	if ((flags & IOMAP_WRITE) && xfs_ipincount(ip) &&
+	    (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
+		iomap->flags |= IOMAP_F_DIRTY;
+
 	xfs_bmbt_to_iomap(ip, iomap, &imap);
 
 	if (shared)
-- 
cgit v1.2.3


From 41cb3301c04d7f0b046ed296a34ef261c456dde8 Mon Sep 17 00:00:00 2001
From: Vishal Verma <vishal.l.verma@intel.com>
Date: Tue, 7 Nov 2017 15:48:21 -0700
Subject: tools/testing/nvdimm: stricter bounds checking for error injection
 commands

Ensure that the in/out sizes passed in the nd_cmd_package are sane for
the fixed output size commands (i.e. inject error and clear injected
error).

Reported-by: Dariusz Dokupil <dariusz.dokupil@intel.com>
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 tools/testing/nvdimm/test/nfit.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index 79c5cb23693f..edab68c2e221 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -488,7 +488,7 @@ static int nfit_test_cmd_ars_error_inject(struct nfit_test *t,
 {
 	int rc;
 
-	if (buf_len < sizeof(*err_inj)) {
+	if (buf_len != sizeof(*err_inj)) {
 		rc = -EINVAL;
 		goto err;
 	}
@@ -519,7 +519,7 @@ static int nfit_test_cmd_ars_inject_clear(struct nfit_test *t,
 {
 	int rc;
 
-	if (buf_len < sizeof(*err_clr)) {
+	if (buf_len != sizeof(*err_clr)) {
 		rc = -EINVAL;
 		goto err;
 	}
-- 
cgit v1.2.3


From 0e7f0741450b1b800b3acfc2319e7f24e8a20fcf Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sun, 12 Nov 2017 14:57:09 -0800
Subject: acpi, nfit: validate commands against the device type

Fix occasions in acpi_nfit_ctl where we check the command type without
validating whether we are parsing dimm vs bus level commands. Where the
command numbers alias between dimms and bus we can make the wrong
assumption just checking the raw command number. For example, with a
simple nfit_test mock up of the clear-error command we trigger the
following:

    BUG: unable to handle kernel NULL pointer dereference at 0000000000000094
    IP: acpi_nfit_ctl+0x29b/0x930 [nfit]
    [..]
    Call Trace:
     nfit_test_probe+0xb85/0xc09 [nfit_test]
     platform_drv_probe+0x3b/0xa0
     ? platform_drv_probe+0x3b/0xa0
     driver_probe_device+0x29c/0x450
     ? test_alloc+0x180/0x180 [nfit_test]
     __driver_attach+0xe3/0xf0
     ? driver_probe_device+0x450/0x450
     bus_for_each_dev+0x73/0xc0
     driver_attach+0x1e/0x20
     bus_add_driver+0x173/0x270
     driver_register+0x60/0xe0
     __platform_driver_register+0x36/0x40
     nfit_test_init+0x2a1/0x1000 [nfit_test]

Fixes: 4b27db7e26cd ("acpi, nfit: add support for the _LSI, _LSR, and...")
Reported-by: Vishal Verma <vishal.l.verma@intel.com>
Tested-by: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/core.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 8043bfde7c63..ff2580e7611d 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -483,13 +483,14 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
 			min_t(u32, 256, in_buf.buffer.length), true);
 
 	/* call the BIOS, prefer the named methods over _DSM if available */
-	if (cmd == ND_CMD_GET_CONFIG_SIZE && nfit_mem->has_lsi)
+	if (nvdimm && cmd == ND_CMD_GET_CONFIG_SIZE && nfit_mem->has_lsi)
 		out_obj = acpi_label_info(handle);
-	else if (cmd == ND_CMD_GET_CONFIG_DATA && nfit_mem->has_lsr) {
+	else if (nvdimm && cmd == ND_CMD_GET_CONFIG_DATA && nfit_mem->has_lsr) {
 		struct nd_cmd_get_config_data_hdr *p = buf;
 
 		out_obj = acpi_label_read(handle, p->in_offset, p->in_length);
-	} else if (cmd == ND_CMD_SET_CONFIG_DATA && nfit_mem->has_lsw) {
+	} else if (nvdimm && cmd == ND_CMD_SET_CONFIG_DATA
+			&& nfit_mem->has_lsw) {
 		struct nd_cmd_set_config_hdr *p = buf;
 
 		out_obj = acpi_label_write(handle, p->in_offset, p->in_length,
@@ -497,7 +498,7 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
 	} else {
 		u8 revid;
 
-		if (nfit_mem)
+		if (nvdimm)
 			revid = nfit_dsm_revid(nfit_mem->family, func);
 		else
 			revid = 1;
@@ -565,8 +566,10 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
 	 * Set fw_status for all the commands with a known format to be
 	 * later interpreted by xlat_status().
 	 */
-	if (i >= 1 && ((cmd >= ND_CMD_ARS_CAP && cmd <= ND_CMD_CLEAR_ERROR)
-			|| (cmd >= ND_CMD_SMART && cmd <= ND_CMD_VENDOR)))
+	if (i >= 1 && ((!nvdimm && cmd >= ND_CMD_ARS_CAP
+					&& cmd <= ND_CMD_CLEAR_ERROR)
+				|| (nvdimm && cmd >= ND_CMD_SMART
+					&& cmd <= ND_CMD_VENDOR)))
 		fw_status = *(u32 *) out_obj->buffer.pointer;
 
 	if (offset + in_buf.buffer.length < buf_len) {
-- 
cgit v1.2.3


From fb2a1748355161e050e9f49f1ea9a0ae707a148b Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sun, 12 Nov 2017 14:54:23 -0800
Subject: tools/testing/nvdimm: unit test clear-error commands

Validate command parsing in acpi_nfit_ctl for the clear error command.
This tests for a crash condition introduced by commit 4b27db7e26cd
"acpi, nfit: add support for the _LSI, _LSR, and _LSW label methods".

Cc: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 tools/testing/nvdimm/test/nfit.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index edab68c2e221..7217b2b953b5 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -1812,6 +1812,7 @@ static int nfit_ctl_test(struct device *dev)
 	unsigned long mask, cmd_size, offset;
 	union {
 		struct nd_cmd_get_config_size cfg_size;
+		struct nd_cmd_clear_error clear_err;
 		struct nd_cmd_ars_status ars_stat;
 		struct nd_cmd_ars_cap ars_cap;
 		char buf[sizeof(struct nd_cmd_ars_status)
@@ -1995,6 +1996,23 @@ static int nfit_ctl_test(struct device *dev)
 		return -EIO;
 	}
 
+	/* test clear error */
+	cmd_size = sizeof(cmds.clear_err);
+	cmds.clear_err = (struct nd_cmd_clear_error) {
+		.length = 512,
+		.cleared = 512,
+	};
+	rc = setup_result(cmds.buf, cmd_size);
+	if (rc)
+		return rc;
+	rc = acpi_nfit_ctl(&acpi_desc->nd_desc, NULL, ND_CMD_CLEAR_ERROR,
+			cmds.buf, cmd_size, &cmd_rc);
+	if (rc < 0 || cmd_rc) {
+		dev_dbg(dev, "%s: failed at: %d rc: %d cmd_rc: %d\n",
+				__func__, __LINE__, rc, cmd_rc);
+		return -EIO;
+	}
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From aaa422c4c3f6ee958ea9d6c9260ac40f90a3f4e9 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 13 Nov 2017 16:38:44 -0800
Subject: fs, dax: unify IOMAP_F_DIRTY read vs write handling policy in the dax
 core

While reviewing whether MAP_SYNC should strengthen its current guarantee
of syncing writes from the initiating process to also include
third-party readers observing dirty metadata, Dave pointed out that the
check of IOMAP_WRITE is misplaced.

The policy of what to with IOMAP_F_DIRTY should be separated from the
generic filesystem mechanism of reporting dirty metadata. Move this
policy to the fs-dax core to simplify the per-filesystem iomap handlers,
and further centralize code that implements the MAP_SYNC policy. This
otherwise should not change behavior, it just makes it easier to change
behavior in the future.

Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Reported-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/dax.c           | 15 +++++++++++++--
 fs/ext4/inode.c    |  2 +-
 fs/xfs/xfs_iomap.c |  4 ++--
 3 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 78233c716757..27ba300660ff 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1079,6 +1079,17 @@ static int dax_fault_return(int error)
 	return VM_FAULT_SIGBUS;
 }
 
+/*
+ * MAP_SYNC on a dax mapping guarantees dirty metadata is
+ * flushed on write-faults (non-cow), but not read-faults.
+ */
+static bool dax_fault_is_synchronous(unsigned long flags,
+		struct vm_area_struct *vma, struct iomap *iomap)
+{
+	return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC)
+		&& (iomap->flags & IOMAP_F_DIRTY);
+}
+
 static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 			       const struct iomap_ops *ops)
 {
@@ -1170,7 +1181,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 		goto finish_iomap;
 	}
 
-	sync = (vma->vm_flags & VM_SYNC) && (iomap.flags & IOMAP_F_DIRTY);
+	sync = dax_fault_is_synchronous(flags, vma, &iomap);
 
 	switch (iomap.type) {
 	case IOMAP_MAPPED:
@@ -1390,7 +1401,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	if (iomap.offset + iomap.length < pos + PMD_SIZE)
 		goto finish_iomap;
 
-	sync = (vma->vm_flags & VM_SYNC) && (iomap.flags & IOMAP_F_DIRTY);
+	sync = dax_fault_is_synchronous(iomap_flags, vma, &iomap);
 
 	switch (iomap.type) {
 	case IOMAP_MAPPED:
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 13a198924a0f..ee4d907a4251 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3479,7 +3479,7 @@ retry:
 	}
 
 	iomap->flags = 0;
-	if ((flags & IOMAP_WRITE) && ext4_inode_datasync_dirty(inode))
+	if (ext4_inode_datasync_dirty(inode))
 		iomap->flags |= IOMAP_F_DIRTY;
 	iomap->bdev = inode->i_sb->s_bdev;
 	iomap->dax_dev = sbi->s_daxdev;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index b43be199fbdf..3c0e4cf72d2b 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1087,8 +1087,8 @@ xfs_file_iomap_begin(
 		trace_xfs_iomap_found(ip, offset, length, 0, &imap);
 	}
 
-	if ((flags & IOMAP_WRITE) && xfs_ipincount(ip) &&
-	    (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
+	if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields
+				& ~XFS_ILOG_TIMESTAMP))
 		iomap->flags |= IOMAP_F_DIRTY;
 
 	xfs_bmbt_to_iomap(ip, iomap, &imap);
-- 
cgit v1.2.3


From 66a86cc10945648cf506ef314b98deeb7af06419 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 14 Oct 2017 11:32:33 -0700
Subject: dax: quiet bdev_dax_supported()

Before we add another failure reason, quiet the existing log messages.
Leave it to the caller to decide if bdev_dax_supported() failures are
errors worth emitting to the log.

Reported-by: Jeff Moyer <jmoyer@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dax/super.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 557b93703532..b0cc8117eebe 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -92,21 +92,21 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize)
 	long len;
 
 	if (blocksize != PAGE_SIZE) {
-		pr_err("VFS (%s): error: unsupported blocksize for dax\n",
+		pr_debug("VFS (%s): error: unsupported blocksize for dax\n",
 				sb->s_id);
 		return -EINVAL;
 	}
 
 	err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff);
 	if (err) {
-		pr_err("VFS (%s): error: unaligned partition for dax\n",
+		pr_debug("VFS (%s): error: unaligned partition for dax\n",
 				sb->s_id);
 		return err;
 	}
 
 	dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
 	if (!dax_dev) {
-		pr_err("VFS (%s): error: device does not support dax\n",
+		pr_debug("VFS (%s): error: device does not support dax\n",
 				sb->s_id);
 		return -EOPNOTSUPP;
 	}
@@ -118,7 +118,7 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize)
 	put_dax(dax_dev);
 
 	if (len < 1) {
-		pr_err("VFS (%s): error: dax access failed (%ld)",
+		pr_debug("VFS (%s): error: dax access failed (%ld)\n",
 				sb->s_id, len);
 		return len < 0 ? len : -EIO;
 	}
-- 
cgit v1.2.3


From 7a862fbbdec665190c5ef298c0c6ec9f3915cf45 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 23 Oct 2017 10:24:45 -0700
Subject: brd: remove dax support

DAX support in brd is awkward because its backing page frames are
distinct from the ones provided by pmem, dcssblk, or axonram. We need
pfn_t_devmap() entries to fully support DAX, and the limited DAX support
for pfn_t_special() page frames is not interesting for brd when pmem is
already a superset of brd.  Lastly, brd is the only dax capable driver
that may sleep in its ->direct_access() implementation. So it causes a
global burden with no net gain of kernel functionality.

For all these reasons, remove DAX support.

Cc: Jens Axboe <axboe@kernel.dk>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/block/Kconfig | 12 ----------
 drivers/block/brd.c   | 65 ---------------------------------------------------
 2 files changed, 77 deletions(-)

diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 2dfe99b328f8..da8bf0268ade 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -297,7 +297,6 @@ config BLK_DEV_SX8
 
 config BLK_DEV_RAM
 	tristate "RAM block device support"
-	select DAX if BLK_DEV_RAM_DAX
 	---help---
 	  Saying Y here will allow you to use a portion of your RAM memory as
 	  a block device, so that you can make file systems on it, read and
@@ -333,17 +332,6 @@ config BLK_DEV_RAM_SIZE
 	  The default value is 4096 kilobytes. Only change this if you know
 	  what you are doing.
 
-config BLK_DEV_RAM_DAX
-	bool "Support Direct Access (DAX) to RAM block devices"
-	depends on BLK_DEV_RAM && FS_DAX
-	default n
-	help
-	  Support filesystems using DAX to access RAM block devices.  This
-	  avoids double-buffering data in the page cache before copying it
-	  to the block device.  Answering Y will slightly enlarge the kernel,
-	  and will prevent RAM block device backing store memory from being
-	  allocated from highmem (only a problem for highmem systems).
-
 config CDROM_PKTCDVD
 	tristate "Packet writing on CD/DVD media (DEPRECATED)"
 	depends on !UML
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 2d7178f7754e..b2391bbd7e5a 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -20,11 +20,6 @@
 #include <linux/radix-tree.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
-#ifdef CONFIG_BLK_DEV_RAM_DAX
-#include <linux/pfn_t.h>
-#include <linux/dax.h>
-#include <linux/uio.h>
-#endif
 
 #include <linux/uaccess.h>
 
@@ -44,9 +39,6 @@ struct brd_device {
 
 	struct request_queue	*brd_queue;
 	struct gendisk		*brd_disk;
-#ifdef CONFIG_BLK_DEV_RAM_DAX
-	struct dax_device	*dax_dev;
-#endif
 	struct list_head	brd_list;
 
 	/*
@@ -112,9 +104,6 @@ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
 	 * restriction might be able to be lifted.
 	 */
 	gfp_flags = GFP_NOIO | __GFP_ZERO;
-#ifndef CONFIG_BLK_DEV_RAM_DAX
-	gfp_flags |= __GFP_HIGHMEM;
-#endif
 	page = alloc_page(gfp_flags);
 	if (!page)
 		return NULL;
@@ -334,43 +323,6 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector,
 	return err;
 }
 
-#ifdef CONFIG_BLK_DEV_RAM_DAX
-static long __brd_direct_access(struct brd_device *brd, pgoff_t pgoff,
-		long nr_pages, void **kaddr, pfn_t *pfn)
-{
-	struct page *page;
-
-	if (!brd)
-		return -ENODEV;
-	page = brd_insert_page(brd, (sector_t)pgoff << PAGE_SECTORS_SHIFT);
-	if (!page)
-		return -ENOSPC;
-	*kaddr = page_address(page);
-	*pfn = page_to_pfn_t(page);
-
-	return 1;
-}
-
-static long brd_dax_direct_access(struct dax_device *dax_dev,
-		pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn)
-{
-	struct brd_device *brd = dax_get_private(dax_dev);
-
-	return __brd_direct_access(brd, pgoff, nr_pages, kaddr, pfn);
-}
-
-static size_t brd_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
-		void *addr, size_t bytes, struct iov_iter *i)
-{
-	return copy_from_iter(addr, bytes, i);
-}
-
-static const struct dax_operations brd_dax_ops = {
-	.direct_access = brd_dax_direct_access,
-	.copy_from_iter = brd_dax_copy_from_iter,
-};
-#endif
-
 static const struct block_device_operations brd_fops = {
 	.owner =		THIS_MODULE,
 	.rw_page =		brd_rw_page,
@@ -450,21 +402,8 @@ static struct brd_device *brd_alloc(int i)
 	sprintf(disk->disk_name, "ram%d", i);
 	set_capacity(disk, rd_size * 2);
 
-#ifdef CONFIG_BLK_DEV_RAM_DAX
-	queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue);
-	brd->dax_dev = alloc_dax(brd, disk->disk_name, &brd_dax_ops);
-	if (!brd->dax_dev)
-		goto out_free_inode;
-#endif
-
-
 	return brd;
 
-#ifdef CONFIG_BLK_DEV_RAM_DAX
-out_free_inode:
-	kill_dax(brd->dax_dev);
-	put_dax(brd->dax_dev);
-#endif
 out_free_queue:
 	blk_cleanup_queue(brd->brd_queue);
 out_free_dev:
@@ -504,10 +443,6 @@ out:
 static void brd_del_one(struct brd_device *brd)
 {
 	list_del(&brd->brd_list);
-#ifdef CONFIG_BLK_DEV_RAM_DAX
-	kill_dax(brd->dax_dev);
-	put_dax(brd->dax_dev);
-#endif
 	del_gendisk(brd->brd_disk);
 	brd_free(brd);
 }
-- 
cgit v1.2.3


From 6a21586a637e624ae736f94aeb0839f6a1dd0411 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 14 Oct 2017 17:42:02 -0700
Subject: dax: stop requiring a live device for dax_flush()

Now that dax_flush() is no longer a driver callback (commit c3ca015fab6d
"dax: remove the pmem_dax_ops->flush abstraction"), stop requiring the
dax_read_lock() to be held and the device to be alive.  This is in
preparation for switching filesystem-dax to store pfns instead of
sectors in the radix.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dax/super.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index b0cc8117eebe..69329e3954ea 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -273,9 +273,6 @@ EXPORT_SYMBOL_GPL(dax_copy_from_iter);
 void arch_wb_cache_pmem(void *addr, size_t size);
 void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
 {
-	if (unlikely(!dax_alive(dax_dev)))
-		return;
-
 	if (unlikely(!test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags)))
 		return;
 
-- 
cgit v1.2.3


From 957ac8c421ad8b5eef9b17fe98e146d8311a541e Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Tue, 14 Nov 2017 20:37:27 -0500
Subject: dax: fix PMD faults on zero-length files

PMD faults on a zero length file on a file system mounted with -o dax
will not generate SIGBUS as expected.

	fd = open(...O_TRUNC);
	addr = mmap(NULL, 2*1024*1024, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
	*addr = 'a';
        <expect SIGBUS>

The problem is this code in dax_iomap_pmd_fault:

	max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;

If the inode size is zero, we end up with a max_pgoff that is way larger
than 0.  :)  Fix it by using DIV_ROUND_UP, as is done elsewhere in the
kernel.

I tested this with some simple test code that ensured that SIGBUS was
received where expected.

Cc: <stable@vger.kernel.org>
Fixes: 642261ac995e ("dax: add struct iomap based DAX PMD support")
Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/dax.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 27ba300660ff..f757cd0e2d07 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1333,7 +1333,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	 * this is a reliable test.
 	 */
 	pgoff = linear_page_index(vma, pmd_addr);
-	max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
+	max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
 
 	trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
 
@@ -1357,13 +1357,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
 		goto fallback;
 
-	if (pgoff > max_pgoff) {
+	if (pgoff >= max_pgoff) {
 		result = VM_FAULT_SIGBUS;
 		goto out;
 	}
 
 	/* If the PMD would extend beyond the file size */
-	if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
+	if ((pgoff | PG_PMD_COLOUR) >= max_pgoff)
 		goto fallback;
 
 	/*
-- 
cgit v1.2.3


From 9f586fff6574f6ecbf323f92d44ffaf0d96225fe Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Tue, 14 Nov 2017 09:59:54 -0500
Subject: dax: fix general protection fault in dax_alloc_inode

Don't crash in case of allocation failure in dax_alloc_inode.

    syzkaller hit the following crash on e4880bc5dfb1

    kasan: CONFIG_KASAN_INLINE enabled
    kasan: GPF could be caused by NULL-ptr deref or user memory access
    [..]
    RIP: 0010:dax_alloc_inode+0x3b/0x70 drivers/dax/super.c:348
    Call Trace:
    alloc_inode+0x65/0x180 fs/inode.c:208
    new_inode_pseudo+0x69/0x190 fs/inode.c:890
    new_inode+0x1c/0x40 fs/inode.c:919
    mount_pseudo_xattr+0x288/0x560 fs/libfs.c:261
    mount_pseudo include/linux/fs.h:2137 [inline]
    dax_mount+0x2e/0x40 drivers/dax/super.c:388
    mount_fs+0x66/0x2d0 fs/super.c:1223

Cc: <stable@vger.kernel.org>
Fixes: 7b6be8444e0f ("dax: refactor dax-fs into a generic provider...")
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dax/super.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 69329e3954ea..3ec804672601 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -341,6 +341,9 @@ static struct inode *dax_alloc_inode(struct super_block *sb)
 	struct inode *inode;
 
 	dax_dev = kmem_cache_alloc(dax_cache, GFP_KERNEL);
+	if (!dax_dev)
+		return NULL;
+
 	inode = &dax_dev->inode;
 	inode->i_rdev = 0;
 	return inode;
-- 
cgit v1.2.3


From 79ab67ede21f536851a99ea68ee6fc1f5435e055 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 15 Nov 2017 10:10:48 -0800
Subject: acpi, nfit: add 'Enable Latch System Shutdown Status' command support

The NVDIMM_FAMILY_INTEL 'Enable Latch System Shutdown Status' command
indicates to the platform that system software has acknowledged the most
recent unsafe shutdown status.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/nfit.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h
index 341be9511d0e..f0cf18b2da8b 100644
--- a/drivers/acpi/nfit/nfit.h
+++ b/drivers/acpi/nfit/nfit.h
@@ -51,6 +51,7 @@
  * non-default DSM revision ids
  */
 enum nvdimm_family_cmds {
+	NVDIMM_INTEL_LATCH_SHUTDOWN = 10,
 	NVDIMM_INTEL_GET_MODES = 11,
 	NVDIMM_INTEL_GET_FWINFO = 12,
 	NVDIMM_INTEL_START_FWUPDATE = 13,
@@ -66,7 +67,7 @@ enum nvdimm_family_cmds {
  | 1 << NVDIMM_INTEL_GET_FWINFO | 1 << NVDIMM_INTEL_START_FWUPDATE \
  | 1 << NVDIMM_INTEL_SEND_FWUPDATE | 1 << NVDIMM_INTEL_FINISH_FWUPDATE \
  | 1 << NVDIMM_INTEL_QUERY_FWUPDATE | 1 << NVDIMM_INTEL_SET_THRESHOLD \
- | 1 << NVDIMM_INTEL_INJECT_ERROR)
+ | 1 << NVDIMM_INTEL_INJECT_ERROR | 1 << NVDIMM_INTEL_LATCH_SHUTDOWN)
 
 enum nfit_uuids {
 	/* for simplicity alias the uuid index with the family id */
-- 
cgit v1.2.3