summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-10-30 15:40:57 -1000
committerLinus Torvalds <torvalds@linux-foundation.org>2023-10-30 15:40:57 -1000
commitf0d25b5d0f8ef0ad35f1beff17da5843279d47a1 (patch)
tree73bb0fea8ed4a96d01517349dfb2588c3ae37e6d
parent1641b9b04002c22f616a51a164c04b7f679d241f (diff)
parenta1e2b8b36820d8c91275f207e77e91645b7c6836 (diff)
downloadlinux-f0d25b5d0f8ef0ad35f1beff17da5843279d47a1.tar.gz
linux-f0d25b5d0f8ef0ad35f1beff17da5843279d47a1.tar.bz2
linux-f0d25b5d0f8ef0ad35f1beff17da5843279d47a1.zip
Merge tag 'x86-mm-2023-10-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 mm handling updates from Ingo Molnar: - Add new NX-stack self-test - Improve NUMA partial-CFMWS handling - Fix #VC handler bugs resulting in SEV-SNP boot failures - Drop the 4MB memory size restriction on minimal NUMA nodes - Reorganize headers a bit, in preparation to header dependency reduction efforts - Misc cleanups & fixes * tag 'x86-mm-2023-10-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: x86/mm: Drop the 4 MB restriction on minimal NUMA node memory size selftests/x86/lam: Zero out buffer for readlink() x86/sev: Drop unneeded #include x86/sev: Move sev_setup_arch() to mem_encrypt.c x86/tdx: Replace deprecated strncpy() with strtomem_pad() selftests/x86/mm: Add new test that userspace stack is in fact NX x86/sev: Make boot_ghcb_page[] static x86/boot: Move x86_cache_alignment initialization to correct spot x86/sev-es: Set x86_virt_bits to the correct value straight away, instead of a two-phase approach x86/sev-es: Allow copy_from_kernel_nofault() in earlier boot x86_64: Show CR4.PSE on auxiliaries like on BSP x86/iommu/docs: Update AMD IOMMU specification document URL x86/sev/docs: Update document URL in amd-memory-encryption.rst x86/mm: Move arch_memory_failure() and arch_is_platform_page() definitions from <asm/processor.h> to <asm/pgtable.h> ACPI/NUMA: Apply SRAT proximity domain to entire CFMWS window x86/numa: Introduce numa_fill_memblks()
-rw-r--r--Documentation/arch/x86/amd-memory-encryption.rst2
-rw-r--r--Documentation/arch/x86/iommu.rst2
-rw-r--r--arch/x86/boot/compressed/sev.c2
-rw-r--r--arch/x86/coco/tdx/tdx.c2
-rw-r--r--arch/x86/include/asm/mem_encrypt.h4
-rw-r--r--arch/x86/include/asm/numa.h7
-rw-r--r--arch/x86/include/asm/pgtable.h8
-rw-r--r--arch/x86/include/asm/processor.h8
-rw-r--r--arch/x86/include/asm/sparsemem.h2
-rw-r--r--arch/x86/kernel/cpu/common.c40
-rw-r--r--arch/x86/kernel/head_64.S4
-rw-r--r--arch/x86/kernel/setup.c2
-rw-r--r--arch/x86/mm/maccess.c19
-rw-r--r--arch/x86/mm/mem_encrypt.c34
-rw-r--r--arch/x86/mm/mem_encrypt_amd.c36
-rw-r--r--arch/x86/mm/numa.c87
-rw-r--r--drivers/acpi/numa/srat.c11
-rw-r--r--include/linux/numa.h7
-rw-r--r--tools/testing/selftests/x86/Makefile4
-rw-r--r--tools/testing/selftests/x86/lam.c6
-rw-r--r--tools/testing/selftests/x86/nx_stack.c212
21 files changed, 404 insertions, 95 deletions
diff --git a/Documentation/arch/x86/amd-memory-encryption.rst b/Documentation/arch/x86/amd-memory-encryption.rst
index 934310ce7258..07caa8fff852 100644
--- a/Documentation/arch/x86/amd-memory-encryption.rst
+++ b/Documentation/arch/x86/amd-memory-encryption.rst
@@ -130,4 +130,4 @@ SNP feature support.
More details in AMD64 APM[1] Vol 2: 15.34.10 SEV_STATUS MSR
-[1] https://www.amd.com/system/files/TechDocs/40332.pdf
+[1] https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/programmer-references/24593.pdf
diff --git a/Documentation/arch/x86/iommu.rst b/Documentation/arch/x86/iommu.rst
index 42c7a6faa39a..41fbadfe2221 100644
--- a/Documentation/arch/x86/iommu.rst
+++ b/Documentation/arch/x86/iommu.rst
@@ -5,7 +5,7 @@ x86 IOMMU Support
The architecture specs can be obtained from the below locations.
- Intel: http://www.intel.com/content/dam/www/public/us/en/documents/product-specifications/vt-directed-io-spec.pdf
-- AMD: https://www.amd.com/system/files/TechDocs/48882_IOMMU.pdf
+- AMD: https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/specifications/48882_3_07_PUB.pdf
This guide gives a quick cheat sheet for some basic understanding.
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index 7816ff55025d..454acd7a2daf 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -25,7 +25,7 @@
#include "error.h"
#include "../msr.h"
-struct ghcb boot_ghcb_page __aligned(PAGE_SIZE);
+static struct ghcb boot_ghcb_page __aligned(PAGE_SIZE);
struct ghcb *boot_ghcb;
/*
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 1d6b863c42b0..2e1be592c220 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -119,7 +119,7 @@ static void __noreturn tdx_panic(const char *msg)
} message;
/* VMM assumes '\0' in byte 65, if the message took all 64 bytes */
- strncpy(message.str, msg, 64);
+ strtomem_pad(message.str, msg, '\0');
args.r8 = message.r8;
args.r9 = message.r9;
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index 473b16d73b47..359ada486fa9 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -19,8 +19,10 @@
#ifdef CONFIG_X86_MEM_ENCRYPT
void __init mem_encrypt_init(void);
+void __init mem_encrypt_setup_arch(void);
#else
static inline void mem_encrypt_init(void) { }
+static inline void __init mem_encrypt_setup_arch(void) { }
#endif
#ifdef CONFIG_AMD_MEM_ENCRYPT
@@ -43,7 +45,6 @@ void __init sme_map_bootdata(char *real_mode_data);
void __init sme_unmap_bootdata(char *real_mode_data);
void __init sme_early_init(void);
-void __init sev_setup_arch(void);
void __init sme_encrypt_kernel(struct boot_params *bp);
void __init sme_enable(struct boot_params *bp);
@@ -73,7 +74,6 @@ static inline void __init sme_map_bootdata(char *real_mode_data) { }
static inline void __init sme_unmap_bootdata(char *real_mode_data) { }
static inline void __init sme_early_init(void) { }
-static inline void __init sev_setup_arch(void) { }
static inline void __init sme_encrypt_kernel(struct boot_params *bp) { }
static inline void __init sme_enable(struct boot_params *bp) { }
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index e3bae2b60a0d..ef2844d69173 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -12,13 +12,6 @@
#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
-/*
- * Too small node sizes may confuse the VM badly. Usually they
- * result from BIOS bugs. So dont recognize nodes as standalone
- * NUMA entities that have less than this amount of RAM listed:
- */
-#define NODE_MIN_SIZE (4*1024*1024)
-
extern int numa_off;
/*
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index e02b179ec659..57bab91bbf50 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1716,6 +1716,14 @@ static inline bool pud_user_accessible_page(pud_t pud)
}
#endif
+#ifdef CONFIG_X86_SGX
+int arch_memory_failure(unsigned long pfn, int flags);
+#define arch_memory_failure arch_memory_failure
+
+bool arch_is_platform_page(u64 paddr);
+#define arch_is_platform_page arch_is_platform_page
+#endif
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_PGTABLE_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 6e30b27b1ebe..621ff0803b04 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -724,14 +724,6 @@ enum mds_mitigations {
MDS_MITIGATION_VMWERV,
};
-#ifdef CONFIG_X86_SGX
-int arch_memory_failure(unsigned long pfn, int flags);
-#define arch_memory_failure arch_memory_failure
-
-bool arch_is_platform_page(u64 paddr);
-#define arch_is_platform_page arch_is_platform_page
-#endif
-
extern bool gds_ucode_mitigated(void);
#endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
index 64df897c0ee3..1be13b2dfe8b 100644
--- a/arch/x86/include/asm/sparsemem.h
+++ b/arch/x86/include/asm/sparsemem.h
@@ -37,6 +37,8 @@ extern int phys_to_target_node(phys_addr_t start);
#define phys_to_target_node phys_to_target_node
extern int memory_add_physaddr_to_nid(u64 start);
#define memory_add_physaddr_to_nid memory_add_physaddr_to_nid
+extern int numa_fill_memblks(u64 start, u64 end);
+#define numa_fill_memblks numa_fill_memblks
#endif
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 0a3ea787ac51..3f8de9bce61a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1115,18 +1115,34 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
void get_cpu_address_sizes(struct cpuinfo_x86 *c)
{
u32 eax, ebx, ecx, edx;
+ bool vp_bits_from_cpuid = true;
- if (c->extended_cpuid_level >= 0x80000008) {
+ if (!cpu_has(c, X86_FEATURE_CPUID) ||
+ (c->extended_cpuid_level < 0x80000008))
+ vp_bits_from_cpuid = false;
+
+ if (vp_bits_from_cpuid) {
cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
c->x86_virt_bits = (eax >> 8) & 0xff;
c->x86_phys_bits = eax & 0xff;
+ } else {
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ c->x86_clflush_size = 64;
+ c->x86_phys_bits = 36;
+ c->x86_virt_bits = 48;
+ } else {
+ c->x86_clflush_size = 32;
+ c->x86_virt_bits = 32;
+ c->x86_phys_bits = 32;
+
+ if (cpu_has(c, X86_FEATURE_PAE) ||
+ cpu_has(c, X86_FEATURE_PSE36))
+ c->x86_phys_bits = 36;
+ }
}
-#ifdef CONFIG_X86_32
- else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
- c->x86_phys_bits = 36;
-#endif
c->x86_cache_bits = c->x86_phys_bits;
+ c->x86_cache_alignment = c->x86_clflush_size;
}
static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
@@ -1580,17 +1596,6 @@ static void __init cpu_parse_early_param(void)
*/
static void __init early_identify_cpu(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_X86_64
- c->x86_clflush_size = 64;
- c->x86_phys_bits = 36;
- c->x86_virt_bits = 48;
-#else
- c->x86_clflush_size = 32;
- c->x86_phys_bits = 32;
- c->x86_virt_bits = 32;
-#endif
- c->x86_cache_alignment = c->x86_clflush_size;
-
memset(&c->x86_capability, 0, sizeof(c->x86_capability));
c->extended_cpuid_level = 0;
@@ -1602,7 +1607,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
cpu_detect(c);
get_cpu_vendor(c);
get_cpu_cap(c);
- get_cpu_address_sizes(c);
setup_force_cpu_cap(X86_FEATURE_CPUID);
cpu_parse_early_param();
@@ -1618,6 +1622,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
setup_clear_cpu_cap(X86_FEATURE_CPUID);
}
+ get_cpu_address_sizes(c);
+
setup_force_cpu_cap(X86_FEATURE_ALWAYS);
cpu_set_bug_bits(c);
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 6bbbef6586d1..086a2c3aaaa0 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -179,8 +179,8 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
movl $0, %ecx
#endif
- /* Enable PAE mode, PGE and LA57 */
- orl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
+ /* Enable PAE mode, PSE, PGE and LA57 */
+ orl $(X86_CR4_PAE | X86_CR4_PSE | X86_CR4_PGE), %ecx
#ifdef CONFIG_X86_5LEVEL
testl $1, __pgtable_l5_enabled(%rip)
jz 1f
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 3e05ecf8cf8d..ccd3ad29a1dc 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1120,7 +1120,7 @@ void __init setup_arch(char **cmdline_p)
* Needs to run after memblock setup because it needs the physical
* memory size.
*/
- sev_setup_arch();
+ mem_encrypt_setup_arch();
efi_fake_memmap();
efi_find_mirror();
diff --git a/arch/x86/mm/maccess.c b/arch/x86/mm/maccess.c
index 5a53c2cc169c..6993f026adec 100644
--- a/arch/x86/mm/maccess.c
+++ b/arch/x86/mm/maccess.c
@@ -9,12 +9,21 @@ bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)
unsigned long vaddr = (unsigned long)unsafe_src;
/*
- * Range covering the highest possible canonical userspace address
- * as well as non-canonical address range. For the canonical range
- * we also need to include the userspace guard page.
+ * Do not allow userspace addresses. This disallows
+ * normal userspace and the userspace guard page:
*/
- return vaddr >= TASK_SIZE_MAX + PAGE_SIZE &&
- __is_canonical_address(vaddr, boot_cpu_data.x86_virt_bits);
+ if (vaddr < TASK_SIZE_MAX + PAGE_SIZE)
+ return false;
+
+ /*
+ * Allow everything during early boot before 'x86_virt_bits'
+ * is initialized. Needed for instruction decoding in early
+ * exception handlers.
+ */
+ if (!boot_cpu_data.x86_virt_bits)
+ return true;
+
+ return __is_canonical_address(vaddr, boot_cpu_data.x86_virt_bits);
}
#else
bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 9f27e14e185f..c290c55b632b 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -12,6 +12,7 @@
#include <linux/swiotlb.h>
#include <linux/cc_platform.h>
#include <linux/mem_encrypt.h>
+#include <linux/virtio_anchor.h>
/* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */
bool force_dma_unencrypted(struct device *dev)
@@ -86,3 +87,36 @@ void __init mem_encrypt_init(void)
print_mem_encrypt_feature_info();
}
+
+void __init mem_encrypt_setup_arch(void)
+{
+ phys_addr_t total_mem = memblock_phys_mem_size();
+ unsigned long size;
+
+ if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
+ return;
+
+ /*
+ * For SEV and TDX, all DMA has to occur via shared/unencrypted pages.
+ * Kernel uses SWIOTLB to make this happen without changing device
+ * drivers. However, depending on the workload being run, the
+ * default 64MB of SWIOTLB may not be enough and SWIOTLB may
+ * run out of buffers for DMA, resulting in I/O errors and/or
+ * performance degradation especially with high I/O workloads.
+ *
+ * Adjust the default size of SWIOTLB using a percentage of guest
+ * memory for SWIOTLB buffers. Also, as the SWIOTLB bounce buffer
+ * memory is allocated from low memory, ensure that the adjusted size
+ * is within the limits of low available memory.
+ *
+ * The percentage of guest memory used here for SWIOTLB buffers
+ * is more of an approximation of the static adjustment which
+ * 64MB for <1G, and ~128M to 256M for 1G-to-4G, i.e., the 6%
+ */
+ size = total_mem * 6 / 100;
+ size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G);
+ swiotlb_adjust_size(size);
+
+ /* Set restricted memory access for virtio. */
+ virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc);
+}
diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c
index 6faea41e99b6..a68f2dda0948 100644
--- a/arch/x86/mm/mem_encrypt_amd.c
+++ b/arch/x86/mm/mem_encrypt_amd.c
@@ -19,8 +19,6 @@
#include <linux/kernel.h>
#include <linux/bitops.h>
#include <linux/dma-mapping.h>
-#include <linux/virtio_config.h>
-#include <linux/virtio_anchor.h>
#include <linux/cc_platform.h>
#include <asm/tlbflush.h>
@@ -215,40 +213,6 @@ void __init sme_map_bootdata(char *real_mode_data)
__sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true);
}
-void __init sev_setup_arch(void)
-{
- phys_addr_t total_mem = memblock_phys_mem_size();
- unsigned long size;
-
- if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
- return;
-
- /*
- * For SEV, all DMA has to occur via shared/unencrypted pages.
- * SEV uses SWIOTLB to make this happen without changing device
- * drivers. However, depending on the workload being run, the
- * default 64MB of SWIOTLB may not be enough and SWIOTLB may
- * run out of buffers for DMA, resulting in I/O errors and/or
- * performance degradation especially with high I/O workloads.
- *
- * Adjust the default size of SWIOTLB for SEV guests using
- * a percentage of guest memory for SWIOTLB buffers.
- * Also, as the SWIOTLB bounce buffer memory is allocated
- * from low memory, ensure that the adjusted size is within
- * the limits of low available memory.
- *
- * The percentage of guest memory used here for SWIOTLB buffers
- * is more of an approximation of the static adjustment which
- * 64MB for <1G, and ~128M to 256M for 1G-to-4G, i.e., the 6%
- */
- size = total_mem * 6 / 100;
- size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G);
- swiotlb_adjust_size(size);
-
- /* Set restricted memory access for virtio. */
- virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc);
-}
-
static unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t *ret_prot)
{
unsigned long pfn = 0;
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index c79f12e449ea..4a17f663f949 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -12,6 +12,7 @@
#include <linux/nodemask.h>
#include <linux/sched.h>
#include <linux/topology.h>
+#include <linux/sort.h>
#include <asm/e820/api.h>
#include <asm/proto.h>
@@ -602,13 +603,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
if (start >= end)
continue;
- /*
- * Don't confuse VM with a node that doesn't have the
- * minimum amount of memory:
- */
- if (end && (end - start) < NODE_MIN_SIZE)
- continue;
-
alloc_node_data(nid);
}
@@ -964,4 +958,83 @@ int memory_add_physaddr_to_nid(u64 start)
return nid;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
+
+static int __init cmp_memblk(const void *a, const void *b)
+{
+ const struct numa_memblk *ma = *(const struct numa_memblk **)a;
+ const struct numa_memblk *mb = *(const struct numa_memblk **)b;
+
+ return ma->start - mb->start;
+}
+
+static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata;
+
+/**
+ * numa_fill_memblks - Fill gaps in numa_meminfo memblks
+ * @start: address to begin fill
+ * @end: address to end fill
+ *
+ * Find and extend numa_meminfo memblks to cover the @start-@end
+ * physical address range, such that the first memblk includes
+ * @start, the last memblk includes @end, and any gaps in between
+ * are filled.
+ *
+ * RETURNS:
+ * 0 : Success
+ * NUMA_NO_MEMBLK : No memblk exists in @start-@end range
+ */
+
+int __init numa_fill_memblks(u64 start, u64 end)
+{
+ struct numa_memblk **blk = &numa_memblk_list[0];
+ struct numa_meminfo *mi = &numa_meminfo;
+ int count = 0;
+ u64 prev_end;
+
+ /*
+ * Create a list of pointers to numa_meminfo memblks that
+ * overlap start, end. Exclude (start == bi->end) since
+ * end addresses in both a CFMWS range and a memblk range
+ * are exclusive.
+ *
+ * This list of pointers is used to make in-place changes
+ * that fill out the numa_meminfo memblks.
+ */
+ for (int i = 0; i < mi->nr_blks; i++) {
+ struct numa_memblk *bi = &mi->blk[i];
+
+ if (start < bi->end && end >= bi->start) {
+ blk[count] = &mi->blk[i];
+ count++;
+ }
+ }
+ if (!count)
+ return NUMA_NO_MEMBLK;
+
+ /* Sort the list of pointers in memblk->start order */
+ sort(&blk[0], count, sizeof(blk[0]), cmp_memblk, NULL);
+
+ /* Make sure the first/last memblks include start/end */
+ blk[0]->start = min(blk[0]->start, start);
+ blk[count - 1]->end = max(blk[count - 1]->end, end);
+
+ /*
+ * Fill any gaps by tracking the previous memblks
+ * end address and backfilling to it if needed.
+ */
+ prev_end = blk[0]->end;
+ for (int i = 1; i < count; i++) {
+ struct numa_memblk *curr = blk[i];
+
+ if (prev_end >= curr->start) {
+ if (prev_end < curr->end)
+ prev_end = curr->end;
+ } else {
+ curr->start = prev_end;
+ prev_end = curr->end;
+ }
+ }
+ return 0;
+}
+
#endif
diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c
index 1f4fc5f8a819..12f330b0eac0 100644
--- a/drivers/acpi/numa/srat.c
+++ b/drivers/acpi/numa/srat.c
@@ -310,11 +310,16 @@ static int __init acpi_parse_cfmws(union acpi_subtable_headers *header,
start = cfmws->base_hpa;
end = cfmws->base_hpa + cfmws->window_size;
- /* Skip if the SRAT already described the NUMA details for this HPA */
- node = phys_to_target_node(start);
- if (node != NUMA_NO_NODE)
+ /*
+ * The SRAT may have already described NUMA details for all,
+ * or a portion of, this CFMWS HPA range. Extend the memblks
+ * found for any portion of the window to cover the entire
+ * window.
+ */
+ if (!numa_fill_memblks(start, end))
return 0;
+ /* No SRAT description. Create a new node. */
node = acpi_map_pxm_to_node(*fake_pxm);
if (node == NUMA_NO_NODE) {
diff --git a/include/linux/numa.h b/include/linux/numa.h
index fb30a42f0700..a904861de800 100644
--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -12,6 +12,7 @@
#define MAX_NUMNODES (1 << NODES_SHIFT)
#define NUMA_NO_NODE (-1)
+#define NUMA_NO_MEMBLK (-1)
/* optionally keep NUMA memory info available post init */
#ifdef CONFIG_NUMA_KEEP_MEMINFO
@@ -43,6 +44,12 @@ static inline int phys_to_target_node(u64 start)
return 0;
}
#endif
+#ifndef numa_fill_memblks
+static inline int __init numa_fill_memblks(u64 start, u64 end)
+{
+ return NUMA_NO_MEMBLK;
+}
+#endif
#else /* !CONFIG_NUMA */
static inline int numa_nearest_node(int node, unsigned int state)
{
diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index 7e8c937627dd..0b872c0a42d2 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -14,6 +14,7 @@ TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap
check_initial_reg_state sigreturn iopl ioperm \
test_vsyscall mov_ss_trap \
syscall_arg_fault fsgsbase_restore sigaltstack
+TARGETS_C_BOTHBITS += nx_stack
TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \
test_FCMOV test_FCOMI test_FISTTP \
vdso_restorer
@@ -109,3 +110,6 @@ $(OUTPUT)/test_syscall_vdso_32: thunks_32.S
# state.
$(OUTPUT)/check_initial_reg_state_32: CFLAGS += -Wl,-ereal_start -static
$(OUTPUT)/check_initial_reg_state_64: CFLAGS += -Wl,-ereal_start -static
+
+$(OUTPUT)/nx_stack_32: CFLAGS += -Wl,-z,noexecstack
+$(OUTPUT)/nx_stack_64: CFLAGS += -Wl,-z,noexecstack
diff --git a/tools/testing/selftests/x86/lam.c b/tools/testing/selftests/x86/lam.c
index eb0e46905bf9..8f9b06d9ce03 100644
--- a/tools/testing/selftests/x86/lam.c
+++ b/tools/testing/selftests/x86/lam.c
@@ -573,7 +573,7 @@ int do_uring(unsigned long lam)
char path[PATH_MAX] = {0};
/* get current process path */
- if (readlink("/proc/self/exe", path, PATH_MAX) <= 0)
+ if (readlink("/proc/self/exe", path, PATH_MAX - 1) <= 0)
return 1;
int file_fd = open(path, O_RDONLY);
@@ -680,14 +680,14 @@ static int handle_execve(struct testcases *test)
perror("Fork failed.");
ret = 1;
} else if (pid == 0) {
- char path[PATH_MAX];
+ char path[PATH_MAX] = {0};
/* Set LAM mode in parent process */
if (set_lam(lam) != 0)
return 1;
/* Get current binary's path and the binary was run by execve */
- if (readlink("/proc/self/exe", path, PATH_MAX) <= 0)
+ if (readlink("/proc/self/exe", path, PATH_MAX - 1) <= 0)
exit(-1);
/* run binary to get LAM mode and return to parent process */
diff --git a/tools/testing/selftests/x86/nx_stack.c b/tools/testing/selftests/x86/nx_stack.c
new file mode 100644
index 000000000000..ea4a4e246879
--- /dev/null
+++ b/tools/testing/selftests/x86/nx_stack.c
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2023 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+/*
+ * Test that userspace stack is NX. Requires linking with -Wl,-z,noexecstack
+ * because I don't want to bother with PT_GNU_STACK detection.
+ *
+ * Fill the stack with INT3's and then try to execute some of them:
+ * SIGSEGV -- good, SIGTRAP -- bad.
+ *
+ * Regular stack is completely overwritten before testing.
+ * Test doesn't exit SIGSEGV handler after first fault at INT3.
+ */
+#undef _GNU_SOURCE
+#define _GNU_SOURCE
+#undef NDEBUG
+#include <assert.h>
+#include <signal.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/resource.h>
+#include <unistd.h>
+
+#define PAGE_SIZE 4096
+
+/*
+ * This is memset(rsp, 0xcc, -1); but down.
+ * It will SIGSEGV when bottom of the stack is reached.
+ * Byte-size access is important! (see rdi tweak in the signal handler).
+ */
+void make_stack1(void);
+asm(
+".pushsection .text\n"
+".globl make_stack1\n"
+".align 16\n"
+"make_stack1:\n"
+ "mov $0xcc, %al\n"
+#if defined __amd64__
+ "mov %rsp, %rdi\n"
+ "mov $-1, %rcx\n"
+#elif defined __i386__
+ "mov %esp, %edi\n"
+ "mov $-1, %ecx\n"
+#else
+#error
+#endif
+ "std\n"
+ "rep stosb\n"
+ /* unreachable */
+ "hlt\n"
+".type make_stack1,@function\n"
+".size make_stack1,.-make_stack1\n"
+".popsection\n"
+);
+
+/*
+ * memset(p, 0xcc, -1);
+ * It will SIGSEGV when top of the stack is reached.
+ */
+void make_stack2(uint64_t p);
+asm(
+".pushsection .text\n"
+".globl make_stack2\n"
+".align 16\n"
+"make_stack2:\n"
+ "mov $0xcc, %al\n"
+#if defined __amd64__
+ "mov $-1, %rcx\n"
+#elif defined __i386__
+ "mov $-1, %ecx\n"
+#else
+#error
+#endif
+ "cld\n"
+ "rep stosb\n"
+ /* unreachable */
+ "hlt\n"
+".type make_stack2,@function\n"
+".size make_stack2,.-make_stack2\n"
+".popsection\n"
+);
+
+static volatile int test_state = 0;
+static volatile unsigned long stack_min_addr;
+
+#if defined __amd64__
+#define RDI REG_RDI
+#define RIP REG_RIP
+#define RIP_STRING "rip"
+#elif defined __i386__
+#define RDI REG_EDI
+#define RIP REG_EIP
+#define RIP_STRING "eip"
+#else
+#error
+#endif
+
+static void sigsegv(int _, siginfo_t *__, void *uc_)
+{
+ /*
+ * Some Linux versions didn't clear DF before entering signal
+ * handler. make_stack1() doesn't have a chance to clear DF
+ * either so we clear it by hand here.
+ */
+ asm volatile ("cld" ::: "memory");
+
+ ucontext_t *uc = uc_;
+
+ if (test_state == 0) {
+ /* Stack is faulted and cleared from RSP to the lowest address. */
+ stack_min_addr = ++uc->uc_mcontext.gregs[RDI];
+ if (1) {
+ printf("stack min %lx\n", stack_min_addr);
+ }
+ uc->uc_mcontext.gregs[RIP] = (uintptr_t)&make_stack2;
+ test_state = 1;
+ } else if (test_state == 1) {
+ /* Stack has been cleared from top to bottom. */
+ unsigned long stack_max_addr = uc->uc_mcontext.gregs[RDI];
+ if (1) {
+ printf("stack max %lx\n", stack_max_addr);
+ }
+ /* Start faulting pages on stack and see what happens. */
+ uc->uc_mcontext.gregs[RIP] = stack_max_addr - PAGE_SIZE;
+ test_state = 2;
+ } else if (test_state == 2) {
+ /* Stack page is NX -- good, test next page. */
+ uc->uc_mcontext.gregs[RIP] -= PAGE_SIZE;
+ if (uc->uc_mcontext.gregs[RIP] == stack_min_addr) {
+ /* One more SIGSEGV and test ends. */
+ test_state = 3;
+ }
+ } else {
+ printf("PASS\tAll stack pages are NX\n");
+ _exit(EXIT_SUCCESS);
+ }
+}
+
+static void sigtrap(int _, siginfo_t *__, void *uc_)
+{
+ const ucontext_t *uc = uc_;
+ unsigned long rip = uc->uc_mcontext.gregs[RIP];
+ printf("FAIL\texecutable page on the stack: " RIP_STRING " %lx\n", rip);
+ _exit(EXIT_FAILURE);
+}
+
+int main(void)
+{
+ {
+ struct sigaction act = {};
+ sigemptyset(&act.sa_mask);
+ act.sa_flags = SA_SIGINFO;
+ act.sa_sigaction = &sigsegv;
+ int rv = sigaction(SIGSEGV, &act, NULL);
+ assert(rv == 0);
+ }
+ {
+ struct sigaction act = {};
+ sigemptyset(&act.sa_mask);
+ act.sa_flags = SA_SIGINFO;
+ act.sa_sigaction = &sigtrap;
+ int rv = sigaction(SIGTRAP, &act, NULL);
+ assert(rv == 0);
+ }
+ {
+ struct rlimit rlim;
+ int rv = getrlimit(RLIMIT_STACK, &rlim);
+ assert(rv == 0);
+ /* Cap stack at time-honored 8 MiB value. */
+ rlim.rlim_max = rlim.rlim_cur;
+ if (rlim.rlim_max > 8 * 1024 * 1024) {
+ rlim.rlim_max = 8 * 1024 * 1024;
+ }
+ rv = setrlimit(RLIMIT_STACK, &rlim);
+ assert(rv == 0);
+ }
+ {
+ /*
+ * We don't know now much stack SIGSEGV handler uses.
+ * Bump this by 1 page every time someone complains,
+ * or rewrite it in assembly.
+ */
+ const size_t len = SIGSTKSZ;
+ void *p = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+ assert(p != MAP_FAILED);
+ stack_t ss = {};
+ ss.ss_sp = p;
+ ss.ss_size = len;
+ int rv = sigaltstack(&ss, NULL);
+ assert(rv == 0);
+ }
+ make_stack1();
+ /*
+ * Unreachable, but if _this_ INT3 is ever reached, it's a bug somewhere.
+ * Fold it into main SIGTRAP pathway.
+ */
+ __builtin_trap();
+}