arm64/io: Provide a WC friendly __iowriteXX_copy()

The kernel provides driver support for using write combining IO memory through the __iowriteXX_copy() API which is commonly used as an optional optimization to generate 16/32/64 byte MemWr TLPs in a PCIe environment. iomap_copy.c provides a generic implementation as a simple 4/8 byte at a time copy loop that has worked well with past ARM64 CPUs, giving a high frequency of large TLPs being successfully formed. However modern ARM64 CPUs are quite sensitive to how the write combining CPU HW is operated and a compiler generated loop with intermixed load/store is not sufficient to frequently generate a large TLP. The CPUs would like to see the entire TLP generated by consecutive store instructions from registers. Compilers like gcc tend to intermix loads and stores and have poor code generation, in part, due to the ARM64 situation that writeq() does not codegen anything other than "[xN]". However even with that resolved compilers like clang still do not have good code generation. This means on modern ARM64 CPUs the rate at which __iowriteXX_copy() successfully generates large TLPs is very small (less than 1 in 10,000) tries), to the point that the use of WC is pointless. Implement __iowrite32/64_copy() specifically for ARM64 and use inline assembly to build consecutive blocks of STR instructions. Provide direct support for 64/32/16 large TLP generation in this manner. Optimize for common constant lengths so that the compiler can directly inline the store blocks. This brings the frequency of large TLP generation up to a high level that is comparable with older CPU generations. As the __iowriteXX_copy() family of APIs is intended for use with WC incorporate the DGH hint directly into the function. Link: https://lore.kernel.org/r/4-v3-1893cd8b9369+1925-mlx5_arm_wc_jgg@nvidia.com Cc: Arnd Bergmann <arnd@arndb.de> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Will Deacon <will@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: linux-arch@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Reviewed-by: Catalin Marinas <catalin.marinas@arm.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
author: Jason Gunthorpe <jgg@nvidia.com> 2024-04-11 13:46:17 -0300
committer: Jason Gunthorpe <jgg@nvidia.com> 2024-04-22 17:11:20 -0300
commit: ead79118dae6f9f982532002e82c2fb291ae0480 (patch)
tree: 2efdf98ee20283a3a44f827e5d977e69cceb4bd7 /arch/arm64/kernel
parent: e7bc47b16622d1016b3b77bbdb20fb9e213045f2 (diff)
download: linux-stable-ead79118dae6f9f982532002e82c2fb291ae0480.tar.gz
linux-stable-ead79118dae6f9f982532002e82c2fb291ae0480.tar.bz2
linux-stable-ead79118dae6f9f982532002e82c2fb291ae0480.zip
1 files changed, 42 insertions, 0 deletions
diff --git a/arch/arm64/kernel/io.c b/arch/arm64/kernel/io.c
index aa7a4ec6a3ae..ef48089fbfe1 100644
--- a/arch/arm64/kernel/io.c
+++ b/arch/arm64/kernel/io.c
@@ -38,6 +38,48 @@ void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count)
 EXPORT_SYMBOL(__memcpy_fromio);
 
 /*
+ * This generates a memcpy that works on a from/to address which is aligned to
+ * bits. Count is in terms of the number of bits sized quantities to copy. It
+ * optimizes to use the STR groupings when possible so that it is WC friendly.
+ */
+#define memcpy_toio_aligned(to, from, count, bits)                        \
+	({                                                                \
+		volatile u##bits __iomem *_to = to;                       \
+		const u##bits *_from = from;                              \
+		size_t _count = count;                                    \
+		const u##bits *_end_from = _from + ALIGN_DOWN(_count, 8); \
+                                                                          \
+		for (; _from < _end_from; _from += 8, _to += 8)           \
+			__const_memcpy_toio_aligned##bits(_to, _from, 8); \
+		if ((_count % 8) >= 4) {                                  \
+			__const_memcpy_toio_aligned##bits(_to, _from, 4); \
+			_from += 4;                                       \
+			_to += 4;                                         \
+		}                                                         \
+		if ((_count % 4) >= 2) {                                  \
+			__const_memcpy_toio_aligned##bits(_to, _from, 2); \
+			_from += 2;                                       \
+			_to += 2;                                         \
+		}                                                         \
+		if (_count % 2)                                           \
+			__const_memcpy_toio_aligned##bits(_to, _from, 1); \
+	})
+
+void __iowrite64_copy_full(void __iomem *to, const void *from, size_t count)
+{
+	memcpy_toio_aligned(to, from, count, 64);
+	dgh();
+}
+EXPORT_SYMBOL(__iowrite64_copy_full);
+
+void __iowrite32_copy_full(void __iomem *to, const void *from, size_t count)
+{
+	memcpy_toio_aligned(to, from, count, 32);
+	dgh();
+}
+EXPORT_SYMBOL(__iowrite32_copy_full);
+
+/*
  * Copy data from "real" memory space to IO memory space.
  */
 void __memcpy_toio(volatile void __iomem *to, const void *from, size_t count)
author	Jason Gunthorpe <jgg@nvidia.com>	2024-04-11 13:46:17 -0300
committer	Jason Gunthorpe <jgg@nvidia.com>	2024-04-22 17:11:20 -0300
commit	ead79118dae6f9f982532002e82c2fb291ae0480 (patch)
tree	2efdf98ee20283a3a44f827e5d977e69cceb4bd7 /arch/arm64/kernel
parent	e7bc47b16622d1016b3b77bbdb20fb9e213045f2 (diff)
download	linux-stable-ead79118dae6f9f982532002e82c2fb291ae0480.tar.gz linux-stable-ead79118dae6f9f982532002e82c2fb291ae0480.tar.bz2 linux-stable-ead79118dae6f9f982532002e82c2fb291ae0480.zip