Diffstat (limited to 'arch/x86/lib')
-rw-r--r--  arch/x86/lib/Makefile                 |  16
-rw-r--r--  arch/x86/lib/atomic64_cx8_32.S        |   9
-rw-r--r--  arch/x86/lib/bhi.S                    | 147
-rw-r--r--  arch/x86/lib/clear_page_64.S          |   9
-rw-r--r--  arch/x86/lib/cmdline.c                |  25
-rw-r--r--  arch/x86/lib/cmpxchg8b_emu.S          |   2
-rw-r--r--  arch/x86/lib/copy_mc.c                |  21
-rw-r--r--  arch/x86/lib/copy_page_64.S           |   3
-rw-r--r--  arch/x86/lib/copy_user_64.S           |  21
-rw-r--r--  arch/x86/lib/copy_user_uncached_64.S  |   2
-rw-r--r--  arch/x86/lib/crc-pclmul-consts.h      | 195
-rw-r--r--  arch/x86/lib/crc-pclmul-template.S    | 582
-rw-r--r--  arch/x86/lib/crc-pclmul-template.h    |  76
-rw-r--r--  arch/x86/lib/crc-t10dif-glue.c        |  40
-rw-r--r--  arch/x86/lib/crc16-msb-pclmul.S       |   6
-rw-r--r--  arch/x86/lib/crc32-glue.c             | 111
-rw-r--r--  arch/x86/lib/crc32-pclmul.S           |   6
-rw-r--r--  arch/x86/lib/crc32c-3way.S            | 360
-rw-r--r--  arch/x86/lib/crc64-glue.c             |  50
-rw-r--r--  arch/x86/lib/crc64-pclmul.S           |   7
-rw-r--r--  arch/x86/lib/delay.c                  |   2
-rw-r--r--  arch/x86/lib/getuser.S                |  83
-rw-r--r--  arch/x86/lib/hweight.S                |   3
-rw-r--r--  arch/x86/lib/insn.c                   |  31
-rw-r--r--  arch/x86/lib/iomap_copy_64.S          |  15
-rw-r--r--  arch/x86/lib/iomem.c                  |   5
-rw-r--r--  arch/x86/lib/memmove_64.S             |   3
-rw-r--r--  arch/x86/lib/memset_64.S              |   3
-rw-r--r--  arch/x86/lib/msr-reg.S                |   3
-rw-r--r--  arch/x86/lib/msr.c                    |   2
-rw-r--r--  arch/x86/lib/putuser.S                |   9
-rw-r--r--  arch/x86/lib/retpoline.S              |   3
-rw-r--r--  arch/x86/lib/usercopy_64.c            |   2
-rw-r--r--  arch/x86/lib/x86-opcode-map.txt       | 315
34 files changed, 1988 insertions(+), 179 deletions(-)
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 6da73513f026..1c50352eb49f 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -38,6 +38,16 @@ lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
lib-$(CONFIG_MITIGATION_RETPOLINE) += retpoline.o
+obj-$(CONFIG_CRC32_ARCH) += crc32-x86.o
+crc32-x86-y := crc32-glue.o crc32-pclmul.o
+crc32-x86-$(CONFIG_64BIT) += crc32c-3way.o
+
+obj-$(CONFIG_CRC64_ARCH) += crc64-x86.o
+crc64-x86-y := crc64-glue.o crc64-pclmul.o
+
+obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-x86.o
+crc-t10dif-x86-y := crc-t10dif-glue.o crc16-msb-pclmul.o
+
obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
obj-y += iomem.o
@@ -49,16 +59,16 @@ ifeq ($(CONFIG_X86_32),y)
lib-y += string_32.o
lib-y += memmove_32.o
lib-y += cmpxchg8b_emu.o
-ifneq ($(CONFIG_X86_CMPXCHG64),y)
+ifneq ($(CONFIG_X86_CX8),y)
lib-y += atomic64_386_32.o
endif
else
- obj-y += iomap_copy_64.o
ifneq ($(CONFIG_GENERIC_CSUM),y)
lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
endif
lib-y += clear_page_64.o copy_page_64.o
lib-y += memmove_64.o memset_64.o
lib-y += copy_user_64.o copy_user_uncached_64.o
- lib-y += cmpxchg16b_emu.o
+ lib-y += cmpxchg16b_emu.o
+ lib-y += bhi.o
endif
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
index 90afb488b396..b2eff07d65e4 100644
--- a/arch/x86/lib/atomic64_cx8_32.S
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -16,6 +16,11 @@
cmpxchg8b (\reg)
.endm
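+/*
+ * Non-atomic 64-bit read.  It is only used below to seed cmpxchg8b retry
+ * loops, which detect a stale or torn value and simply retry, so the lack
+ * of atomicity is harmless.
+ */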
+.macro read64_nonatomic reg
+ movl (\reg), %eax
+ movl 4(\reg), %edx
+.endm
+
SYM_FUNC_START(atomic64_read_cx8)
read64 %ecx
RET
@@ -51,7 +56,7 @@ SYM_FUNC_START(atomic64_\func\()_return_cx8)
movl %edx, %edi
movl %ecx, %ebp
- read64 %ecx
+ read64_nonatomic %ecx
1:
movl %eax, %ebx
movl %edx, %ecx
@@ -79,7 +84,7 @@ addsub_return sub sub sbb
SYM_FUNC_START(atomic64_\func\()_return_cx8)
pushl %ebx
- read64 %esi
+ read64_nonatomic %esi
1:
movl %eax, %ebx
movl %edx, %ecx
diff --git a/arch/x86/lib/bhi.S b/arch/x86/lib/bhi.S
new file mode 100644
index 000000000000..58891681261b
--- /dev/null
+++ b/arch/x86/lib/bhi.S
@@ -0,0 +1,147 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/linkage.h>
+#include <asm/unwind_hints.h>
+#include <asm/nospec-branch.h>
+
+/*
+ * Notably, the FineIBT preamble calling these will have ZF set and r10 zero.
+ *
+ * The very last element is in fact larger than 32 bytes, but since it's the
+ * last element, this does not matter.
+ *
+ * There are 2 #UD sites, located between 0,1-2,3 and 4,5-6,7 such that they
+ * can be reached using Jcc.d8; these elements (1 and 5) have sufficiently
+ * big alignment holes for this to not stagger the array.
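+ *
+ * On entry, ZF is set by the FineIBT hash compare.  When it is clear, each
+ * thunk below branches to one of the #UD sites, and the CMOVNEs in the
+ * larger thunks overwrite the registers they cover with the zero in r10,
+ * so a path that mispredicts past the Jcc cannot forward live argument
+ * values.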
+ */
+
+.pushsection .noinstr.text, "ax"
+
+ .align 32
+SYM_CODE_START(__bhi_args)
+
+#ifdef CONFIG_FINEIBT_BHI
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_0, SYM_L_LOCAL)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_FUNC
+ jne .Lud_1
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_1, SYM_L_LOCAL)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_FUNC
+ jne .Lud_1
+ cmovne %r10, %rdi
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 8
+ ANNOTATE_REACHABLE
+.Lud_1: ud2
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_2, SYM_L_LOCAL)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_FUNC
+ jne .Lud_1
+ cmovne %r10, %rdi
+ cmovne %r10, %rsi
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_3, SYM_L_LOCAL)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_FUNC
+ jne .Lud_1
+ cmovne %r10, %rdi
+ cmovne %r10, %rsi
+ cmovne %r10, %rdx
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_4, SYM_L_LOCAL)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_FUNC
+ jne .Lud_2
+ cmovne %r10, %rdi
+ cmovne %r10, %rsi
+ cmovne %r10, %rdx
+ cmovne %r10, %rcx
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_5, SYM_L_LOCAL)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_FUNC
+ jne .Lud_2
+ cmovne %r10, %rdi
+ cmovne %r10, %rsi
+ cmovne %r10, %rdx
+ cmovne %r10, %rcx
+ cmovne %r10, %r8
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 8
+ ANNOTATE_REACHABLE
+.Lud_2: ud2
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_6, SYM_L_LOCAL)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_FUNC
+ jne .Lud_2
+ cmovne %r10, %rdi
+ cmovne %r10, %rsi
+ cmovne %r10, %rdx
+ cmovne %r10, %rcx
+ cmovne %r10, %r8
+ cmovne %r10, %r9
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_7, SYM_L_LOCAL)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_FUNC
+ jne .Lud_2
+ cmovne %r10, %rdi
+ cmovne %r10, %rsi
+ cmovne %r10, %rdx
+ cmovne %r10, %rcx
+ cmovne %r10, %r8
+ cmovne %r10, %r9
+ cmovne %r10, %rsp
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+#endif /* CONFIG_FINEIBT_BHI */
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_end, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+ nop /* Work around toolchain+objtool quirk */
+SYM_CODE_END(__bhi_args)
+
+.popsection
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index 2760a15fbc00..a508e4a8c66a 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,6 +1,8 @@
/* SPDX-License-Identifier: GPL-2.0-only */
#include <linux/export.h>
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
+#include <linux/objtool.h>
#include <asm/asm.h>
/*
@@ -14,7 +16,7 @@
* Zero a page.
* %rdi - page
*/
-SYM_FUNC_START(clear_page_rep)
+SYM_TYPED_FUNC_START(clear_page_rep)
movl $4096/8,%ecx
xorl %eax,%eax
rep stosq
@@ -22,7 +24,7 @@ SYM_FUNC_START(clear_page_rep)
SYM_FUNC_END(clear_page_rep)
EXPORT_SYMBOL_GPL(clear_page_rep)
-SYM_FUNC_START(clear_page_orig)
+SYM_TYPED_FUNC_START(clear_page_orig)
xorl %eax,%eax
movl $4096/64,%ecx
.p2align 4
@@ -44,7 +46,7 @@ SYM_FUNC_START(clear_page_orig)
SYM_FUNC_END(clear_page_orig)
EXPORT_SYMBOL_GPL(clear_page_orig)
-SYM_FUNC_START(clear_page_erms)
+SYM_TYPED_FUNC_START(clear_page_erms)
movl $4096,%ecx
xorl %eax,%eax
rep stosb
@@ -63,6 +65,7 @@ EXPORT_SYMBOL_GPL(clear_page_erms)
* rcx: uncleared bytes or 0 if successful.
*/
SYM_FUNC_START(rep_stos_alternative)
+ ANNOTATE_NOENDBR
cmpq $64,%rcx
jae .Lunrolled
diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c
index 80570eb3c89b..c65cd5550454 100644
--- a/arch/x86/lib/cmdline.c
+++ b/arch/x86/lib/cmdline.c
@@ -6,8 +6,10 @@
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/ctype.h>
+
#include <asm/setup.h>
#include <asm/cmdline.h>
+#include <asm/bug.h>
static inline int myisspace(u8 c)
{
@@ -205,12 +207,29 @@ __cmdline_find_option(const char *cmdline, int max_cmdline_size,
int cmdline_find_option_bool(const char *cmdline, const char *option)
{
- return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option);
+ int ret;
+
+ ret = __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option);
+ if (ret > 0)
+ return ret;
+
+ if (IS_ENABLED(CONFIG_CMDLINE_BOOL) && !builtin_cmdline_added)
+ return __cmdline_find_option_bool(builtin_cmdline, COMMAND_LINE_SIZE, option);
+
+ return ret;
}
int cmdline_find_option(const char *cmdline, const char *option, char *buffer,
int bufsize)
{
- return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option,
- buffer, bufsize);
+ int ret;
+
+ ret = __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option, buffer, bufsize);
+ if (ret > 0)
+ return ret;
+
+ if (IS_ENABLED(CONFIG_CMDLINE_BOOL) && !builtin_cmdline_added)
+ return __cmdline_find_option(builtin_cmdline, COMMAND_LINE_SIZE, option, buffer, bufsize);
+
+ return ret;
}
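
A minimal caller sketch (illustration only, not part of this patch; the option names are arbitrary examples and the declarations come from the usual kernel headers). With the fallback above, a query against boot_command_line now also matches options that exist only in the CONFIG_CMDLINE built-in string, as long as builtin_cmdline has not yet been appended:

	#include <linux/init.h>
	#include <linux/printk.h>
	#include <asm/cmdline.h>

	static void __init example_cmdline_query(void)
	{
		char buf[32];

		/* Boolean option: > 0 means present on either command line. */
		if (cmdline_find_option_bool(boot_command_line, "nokaslr"))
			pr_info("nokaslr requested\n");

		/* Valued option: the value is copied into buf on success. */
		if (cmdline_find_option(boot_command_line, "console", buf, sizeof(buf)) > 0)
			pr_info("console=%s\n", buf);
	}
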
diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S
index 1c96be769adc..d4bb24347ff8 100644
--- a/arch/x86/lib/cmpxchg8b_emu.S
+++ b/arch/x86/lib/cmpxchg8b_emu.S
@@ -7,7 +7,7 @@
.text
-#ifndef CONFIG_X86_CMPXCHG64
+#ifndef CONFIG_X86_CX8
/*
* Emulate 'cmpxchg8b (%esi)' on UP
diff --git a/arch/x86/lib/copy_mc.c b/arch/x86/lib/copy_mc.c
index 6e8b7e600def..97e88e58567b 100644
--- a/arch/x86/lib/copy_mc.c
+++ b/arch/x86/lib/copy_mc.c
@@ -4,6 +4,7 @@
#include <linux/jump_label.h>
#include <linux/uaccess.h>
#include <linux/export.h>
+#include <linux/instrumented.h>
#include <linux/string.h>
#include <linux/types.h>
@@ -61,10 +62,20 @@ unsigned long copy_mc_enhanced_fast_string(void *dst, const void *src, unsigned
*/
unsigned long __must_check copy_mc_to_kernel(void *dst, const void *src, unsigned len)
{
- if (copy_mc_fragile_enabled)
- return copy_mc_fragile(dst, src, len);
- if (static_cpu_has(X86_FEATURE_ERMS))
- return copy_mc_enhanced_fast_string(dst, src, len);
+ unsigned long ret;
+
+ if (copy_mc_fragile_enabled) {
+ instrument_memcpy_before(dst, src, len);
+ ret = copy_mc_fragile(dst, src, len);
+ instrument_memcpy_after(dst, src, len, ret);
+ return ret;
+ }
+ if (static_cpu_has(X86_FEATURE_ERMS)) {
+ instrument_memcpy_before(dst, src, len);
+ ret = copy_mc_enhanced_fast_string(dst, src, len);
+ instrument_memcpy_after(dst, src, len, ret);
+ return ret;
+ }
memcpy(dst, src, len);
return 0;
}
@@ -75,6 +86,7 @@ unsigned long __must_check copy_mc_to_user(void __user *dst, const void *src, un
unsigned long ret;
if (copy_mc_fragile_enabled) {
+ instrument_copy_to_user(dst, src, len);
__uaccess_begin();
ret = copy_mc_fragile((__force void *)dst, src, len);
__uaccess_end();
@@ -82,6 +94,7 @@ unsigned long __must_check copy_mc_to_user(void __user *dst, const void *src, un
}
if (static_cpu_has(X86_FEATURE_ERMS)) {
+ instrument_copy_to_user(dst, src, len);
__uaccess_begin();
ret = copy_mc_enhanced_fast_string((__force void *)dst, src, len);
__uaccess_end();
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index d6ae793d08fa..d8e87fedc20d 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -3,6 +3,7 @@
#include <linux/export.h>
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
@@ -13,7 +14,7 @@
* prefetch distance based on SMP/UP.
*/
ALIGN
-SYM_FUNC_START(copy_page)
+SYM_TYPED_FUNC_START(copy_page)
ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD
movl $4096/8, %ecx
rep movsq
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index fc9fb5d06174..06296eb69fd4 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -8,6 +8,8 @@
#include <linux/export.h>
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
+#include <linux/objtool.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
#include <asm/asm.h>
@@ -30,6 +32,7 @@
* it simpler for us, we can clobber rsi/rdi and rax freely.
*/
SYM_FUNC_START(rep_movs_alternative)
+ ANNOTATE_NOENDBR
cmpq $64,%rcx
jae .Llarge
@@ -74,6 +77,24 @@ SYM_FUNC_START(rep_movs_alternative)
_ASM_EXTABLE_UA( 0b, 1b)
.Llarge_movsq:
+ /* Do the first possibly unaligned word */
+0: movq (%rsi),%rax
+1: movq %rax,(%rdi)
+
+ _ASM_EXTABLE_UA( 0b, .Lcopy_user_tail)
+ _ASM_EXTABLE_UA( 1b, .Lcopy_user_tail)
+
+ /* What would be the offset to the aligned destination? */
+ leaq 8(%rdi),%rax
+ andq $-8,%rax
+ subq %rdi,%rax
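+	/* e.g. %rdi == 0x1003: leaq -> 0x100b, andq -> 0x1008, subq -> %rax = 5,
+	 * the distance to the next 8-byte boundary; those bytes were already
+	 * covered by the word copied above. */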
+
+ /* .. and update pointers and count to match */
+ addq %rax,%rdi
+ addq %rax,%rsi
+ subq %rax,%rcx
+
+ /* make %rcx contain the number of words, %rax the remainder */
movq %rcx,%rax
shrq $3,%rcx
andl $7,%eax
diff --git a/arch/x86/lib/copy_user_uncached_64.S b/arch/x86/lib/copy_user_uncached_64.S
index 2918e36eece2..18350b343c2a 100644
--- a/arch/x86/lib/copy_user_uncached_64.S
+++ b/arch/x86/lib/copy_user_uncached_64.S
@@ -5,6 +5,7 @@
#include <linux/export.h>
#include <linux/linkage.h>
+#include <linux/objtool.h>
#include <asm/asm.h>
/*
@@ -27,6 +28,7 @@
* rax uncopied bytes or 0 if successful.
*/
SYM_FUNC_START(__copy_user_nocache)
+ ANNOTATE_NOENDBR
/* If destination is not 7-byte aligned, we'll have to align it */
testb $7,%dil
jne .Lalign
diff --git a/arch/x86/lib/crc-pclmul-consts.h b/arch/x86/lib/crc-pclmul-consts.h
new file mode 100644
index 000000000000..fcc63c064333
--- /dev/null
+++ b/arch/x86/lib/crc-pclmul-consts.h
@@ -0,0 +1,195 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * CRC constants generated by:
+ *
+ * ./scripts/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320,crc64_msb_0x42f0e1eba9ea3693,crc64_lsb_0x9a6c9329ac4bc9b5
+ *
+ * Do not edit manually.
+ */
+
+/*
+ * CRC folding constants generated for most-significant-bit-first CRC-16 using
+ * G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
+ */
+static const struct {
+ u8 bswap_mask[16];
+ u64 fold_across_2048_bits_consts[2];
+ u64 fold_across_1024_bits_consts[2];
+ u64 fold_across_512_bits_consts[2];
+ u64 fold_across_256_bits_consts[2];
+ u64 fold_across_128_bits_consts[2];
+ u8 shuf_table[48];
+ u64 barrett_reduction_consts[2];
+} crc16_msb_0x8bb7_consts ____cacheline_aligned __maybe_unused = {
+ .bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
+ .fold_across_2048_bits_consts = {
+ 0xdccf000000000000, /* LO64_TERMS: (x^2000 mod G) * x^48 */
+ 0x4b0b000000000000, /* HI64_TERMS: (x^2064 mod G) * x^48 */
+ },
+ .fold_across_1024_bits_consts = {
+ 0x9d9d000000000000, /* LO64_TERMS: (x^976 mod G) * x^48 */
+ 0x7cf5000000000000, /* HI64_TERMS: (x^1040 mod G) * x^48 */
+ },
+ .fold_across_512_bits_consts = {
+ 0x044c000000000000, /* LO64_TERMS: (x^464 mod G) * x^48 */
+ 0xe658000000000000, /* HI64_TERMS: (x^528 mod G) * x^48 */
+ },
+ .fold_across_256_bits_consts = {
+ 0x6ee3000000000000, /* LO64_TERMS: (x^208 mod G) * x^48 */
+ 0xe7b5000000000000, /* HI64_TERMS: (x^272 mod G) * x^48 */
+ },
+ .fold_across_128_bits_consts = {
+ 0x2d56000000000000, /* LO64_TERMS: (x^80 mod G) * x^48 */
+ 0x06df000000000000, /* HI64_TERMS: (x^144 mod G) * x^48 */
+ },
+ .shuf_table = {
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ },
+ .barrett_reduction_consts = {
+ 0x8bb7000000000000, /* LO64_TERMS: (G - x^16) * x^48 */
+ 0xf65a57f81d33a48a, /* HI64_TERMS: (floor(x^79 / G) * x) - x^64 */
+ },
+};
+
+/*
+ * CRC folding constants generated for least-significant-bit-first CRC-32 using
+ * G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 +
+ * x^5 + x^4 + x^2 + x^1 + x^0
+ */
+static const struct {
+ u64 fold_across_2048_bits_consts[2];
+ u64 fold_across_1024_bits_consts[2];
+ u64 fold_across_512_bits_consts[2];
+ u64 fold_across_256_bits_consts[2];
+ u64 fold_across_128_bits_consts[2];
+ u8 shuf_table[48];
+ u64 barrett_reduction_consts[2];
+} crc32_lsb_0xedb88320_consts ____cacheline_aligned __maybe_unused = {
+ .fold_across_2048_bits_consts = {
+ 0x00000000ce3371cb, /* HI64_TERMS: (x^2079 mod G) * x^32 */
+ 0x00000000e95c1271, /* LO64_TERMS: (x^2015 mod G) * x^32 */
+ },
+ .fold_across_1024_bits_consts = {
+ 0x0000000033fff533, /* HI64_TERMS: (x^1055 mod G) * x^32 */
+ 0x00000000910eeec1, /* LO64_TERMS: (x^991 mod G) * x^32 */
+ },
+ .fold_across_512_bits_consts = {
+ 0x000000008f352d95, /* HI64_TERMS: (x^543 mod G) * x^32 */
+ 0x000000001d9513d7, /* LO64_TERMS: (x^479 mod G) * x^32 */
+ },
+ .fold_across_256_bits_consts = {
+ 0x00000000f1da05aa, /* HI64_TERMS: (x^287 mod G) * x^32 */
+ 0x0000000081256527, /* LO64_TERMS: (x^223 mod G) * x^32 */
+ },
+ .fold_across_128_bits_consts = {
+ 0x00000000ae689191, /* HI64_TERMS: (x^159 mod G) * x^32 */
+ 0x00000000ccaa009e, /* LO64_TERMS: (x^95 mod G) * x^32 */
+ },
+ .shuf_table = {
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ },
+ .barrett_reduction_consts = {
+ 0xb4e5b025f7011641, /* HI64_TERMS: floor(x^95 / G) */
+ 0x00000001db710640, /* LO64_TERMS: (G - x^32) * x^31 */
+ },
+};
+
+/*
+ * CRC folding constants generated for most-significant-bit-first CRC-64 using
+ * G(x) = x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
+ * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
+ * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
+ * x^7 + x^4 + x^1 + x^0
+ */
+static const struct {
+ u8 bswap_mask[16];
+ u64 fold_across_2048_bits_consts[2];
+ u64 fold_across_1024_bits_consts[2];
+ u64 fold_across_512_bits_consts[2];
+ u64 fold_across_256_bits_consts[2];
+ u64 fold_across_128_bits_consts[2];
+ u8 shuf_table[48];
+ u64 barrett_reduction_consts[2];
+} crc64_msb_0x42f0e1eba9ea3693_consts ____cacheline_aligned __maybe_unused = {
+ .bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
+ .fold_across_2048_bits_consts = {
+ 0x7f52691a60ddc70d, /* LO64_TERMS: (x^2048 mod G) * x^0 */
+ 0x7036b0389f6a0c82, /* HI64_TERMS: (x^2112 mod G) * x^0 */
+ },
+ .fold_across_1024_bits_consts = {
+ 0x05cf79dea9ac37d6, /* LO64_TERMS: (x^1024 mod G) * x^0 */
+ 0x001067e571d7d5c2, /* HI64_TERMS: (x^1088 mod G) * x^0 */
+ },
+ .fold_across_512_bits_consts = {
+ 0x5f6843ca540df020, /* LO64_TERMS: (x^512 mod G) * x^0 */
+ 0xddf4b6981205b83f, /* HI64_TERMS: (x^576 mod G) * x^0 */
+ },
+ .fold_across_256_bits_consts = {
+ 0x571bee0a227ef92b, /* LO64_TERMS: (x^256 mod G) * x^0 */
+ 0x44bef2a201b5200c, /* HI64_TERMS: (x^320 mod G) * x^0 */
+ },
+ .fold_across_128_bits_consts = {
+ 0x05f5c3c7eb52fab6, /* LO64_TERMS: (x^128 mod G) * x^0 */
+ 0x4eb938a7d257740e, /* HI64_TERMS: (x^192 mod G) * x^0 */
+ },
+ .shuf_table = {
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ },
+ .barrett_reduction_consts = {
+ 0x42f0e1eba9ea3693, /* LO64_TERMS: (G - x^64) * x^0 */
+ 0x578d29d06cc4f872, /* HI64_TERMS: (floor(x^127 / G) * x) - x^64 */
+ },
+};
+
+/*
+ * CRC folding constants generated for least-significant-bit-first CRC-64 using
+ * G(x) = x^64 + x^63 + x^61 + x^59 + x^58 + x^56 + x^55 + x^52 + x^49 + x^48 +
+ * x^47 + x^46 + x^44 + x^41 + x^37 + x^36 + x^34 + x^32 + x^31 + x^28 +
+ * x^26 + x^23 + x^22 + x^19 + x^16 + x^13 + x^12 + x^10 + x^9 + x^6 +
+ * x^4 + x^3 + x^0
+ */
+static const struct {
+ u64 fold_across_2048_bits_consts[2];
+ u64 fold_across_1024_bits_consts[2];
+ u64 fold_across_512_bits_consts[2];
+ u64 fold_across_256_bits_consts[2];
+ u64 fold_across_128_bits_consts[2];
+ u8 shuf_table[48];
+ u64 barrett_reduction_consts[2];
+} crc64_lsb_0x9a6c9329ac4bc9b5_consts ____cacheline_aligned __maybe_unused = {
+ .fold_across_2048_bits_consts = {
+ 0x37ccd3e14069cabc, /* HI64_TERMS: (x^2111 mod G) * x^0 */
+ 0xa043808c0f782663, /* LO64_TERMS: (x^2047 mod G) * x^0 */
+ },
+ .fold_across_1024_bits_consts = {
+ 0xa1ca681e733f9c40, /* HI64_TERMS: (x^1087 mod G) * x^0 */
+ 0x5f852fb61e8d92dc, /* LO64_TERMS: (x^1023 mod G) * x^0 */
+ },
+ .fold_across_512_bits_consts = {
+ 0x0c32cdb31e18a84a, /* HI64_TERMS: (x^575 mod G) * x^0 */
+ 0x62242240ace5045a, /* LO64_TERMS: (x^511 mod G) * x^0 */
+ },
+ .fold_across_256_bits_consts = {
+ 0xb0bc2e589204f500, /* HI64_TERMS: (x^319 mod G) * x^0 */
+ 0xe1e0bb9d45d7a44c, /* LO64_TERMS: (x^255 mod G) * x^0 */
+ },
+ .fold_across_128_bits_consts = {
+ 0xeadc41fd2ba3d420, /* HI64_TERMS: (x^191 mod G) * x^0 */
+ 0x21e9761e252621ac, /* LO64_TERMS: (x^127 mod G) * x^0 */
+ },
+ .shuf_table = {
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ },
+ .barrett_reduction_consts = {
+ 0x27ecfa329aef9f77, /* HI64_TERMS: floor(x^127 / G) */
+ 0x34d926535897936a, /* LO64_TERMS: (G - x^64 - x^0) / x */
+ },
+};
diff --git a/arch/x86/lib/crc-pclmul-template.S b/arch/x86/lib/crc-pclmul-template.S
new file mode 100644
index 000000000000..ae0b6144c503
--- /dev/null
+++ b/arch/x86/lib/crc-pclmul-template.S
@@ -0,0 +1,582 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+//
+// Template to generate [V]PCLMULQDQ-based CRC functions for x86
+//
+// Copyright 2025 Google LLC
+//
+// Author: Eric Biggers <ebiggers@google.com>
+
+#include <linux/linkage.h>
+#include <linux/objtool.h>
+
+// Offsets within the generated constants table
+.set OFFSETOF_BSWAP_MASK, -5*16 // msb-first CRCs only
+.set OFFSETOF_FOLD_ACROSS_2048_BITS_CONSTS, -4*16 // must precede next
+.set OFFSETOF_FOLD_ACROSS_1024_BITS_CONSTS, -3*16 // must precede next
+.set OFFSETOF_FOLD_ACROSS_512_BITS_CONSTS, -2*16 // must precede next
+.set OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS, -1*16 // must precede next
+.set OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS, 0*16 // must be 0
+.set OFFSETOF_SHUF_TABLE, 1*16
+.set OFFSETOF_BARRETT_REDUCTION_CONSTS, 4*16
+
+// Emit a VEX (or EVEX) coded instruction if allowed, or emulate it using the
+// corresponding non-VEX instruction plus any needed moves. The supported
+// instruction formats are:
+//
+// - Two-arg [src, dst], where the non-VEX format is the same.
+// - Three-arg [src1, src2, dst] where the non-VEX format is
+// [src1, src2_and_dst]. If src2 != dst, then src1 must != dst too.
+//
+// \insn gives the instruction without a "v" prefix and including any immediate
+// argument if needed to make the instruction follow one of the above formats.
+// If \unaligned_mem_tmp is given, then the emitted non-VEX code moves \arg1 to
+// it first; this is needed when \arg1 is an unaligned mem operand.
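+// For example, "_cond_vex pxor, %xmm1, %xmm2, %xmm3" emits
+// "vpxor %xmm1, %xmm2, %xmm3" when VEX is allowed, and otherwise
+// "movdqa %xmm2, %xmm3" followed by "pxor %xmm1, %xmm3".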
+.macro _cond_vex insn:req, arg1:req, arg2:req, arg3, unaligned_mem_tmp
+.if AVX_LEVEL == 0
+ // VEX not allowed. Emulate it.
+ .ifnb \arg3 // Three-arg [src1, src2, dst]
+ .ifc "\arg2", "\arg3" // src2 == dst?
+ .ifnb \unaligned_mem_tmp
+ movdqu \arg1, \unaligned_mem_tmp
+ \insn \unaligned_mem_tmp, \arg3
+ .else
+ \insn \arg1, \arg3
+ .endif
+ .else // src2 != dst
+ .ifc "\arg1", "\arg3"
+ .error "Can't have src1 == dst when src2 != dst"
+ .endif
+ .ifnb \unaligned_mem_tmp
+ movdqu \arg1, \unaligned_mem_tmp
+ movdqa \arg2, \arg3
+ \insn \unaligned_mem_tmp, \arg3
+ .else
+ movdqa \arg2, \arg3
+ \insn \arg1, \arg3
+ .endif
+ .endif
+ .else // Two-arg [src, dst]
+ .ifnb \unaligned_mem_tmp
+ movdqu \arg1, \unaligned_mem_tmp
+ \insn \unaligned_mem_tmp, \arg2
+ .else
+ \insn \arg1, \arg2
+ .endif
+ .endif
+.else
+ // VEX is allowed. Emit the desired instruction directly.
+ .ifnb \arg3
+ v\insn \arg1, \arg2, \arg3
+ .else
+ v\insn \arg1, \arg2
+ .endif
+.endif
+.endm
+
+// Broadcast an aligned 128-bit mem operand to all 128-bit lanes of a vector
+// register of length VL.
+.macro _vbroadcast src, dst
+.if VL == 16
+ _cond_vex movdqa, \src, \dst
+.elseif VL == 32
+ vbroadcasti128 \src, \dst
+.else
+ vbroadcasti32x4 \src, \dst
+.endif
+.endm
+
+// Load \vl bytes from the unaligned mem operand \src into \dst, and if the CRC
+// is msb-first use \bswap_mask to reflect the bytes within each 128-bit lane.
+.macro _load_data vl, src, bswap_mask, dst
+.if \vl < 64
+ _cond_vex movdqu, "\src", \dst
+.else
+ vmovdqu8 \src, \dst
+.endif
+.if !LSB_CRC
+ _cond_vex pshufb, \bswap_mask, \dst, \dst
+.endif
+.endm
+
+.macro _prepare_v0 vl, v0, v1, bswap_mask
+.if LSB_CRC
+ .if \vl < 64
+ _cond_vex pxor, (BUF), \v0, \v0, unaligned_mem_tmp=\v1
+ .else
+ vpxorq (BUF), \v0, \v0
+ .endif
+.else
+ _load_data \vl, (BUF), \bswap_mask, \v1
+ .if \vl < 64
+ _cond_vex pxor, \v1, \v0, \v0
+ .else
+ vpxorq \v1, \v0, \v0
+ .endif
+.endif
+.endm
+
+// The x^0..x^63 terms, i.e. poly128 mod x^64, i.e. the physically low qword for
+// msb-first order or the physically high qword for lsb-first order
+#define LO64_TERMS 0
+
+// The x^64..x^127 terms, i.e. floor(poly128 / x^64), i.e. the physically high
+// qword for msb-first order or the physically low qword for lsb-first order
+#define HI64_TERMS 1
+
+// Multiply the given \src1_terms of each 128-bit lane of \src1 by the given
+// \src2_terms of each 128-bit lane of \src2, and write the result(s) to \dst.
+.macro _pclmulqdq src1, src1_terms, src2, src2_terms, dst
+ _cond_vex "pclmulqdq $((\src1_terms ^ LSB_CRC) << 4) ^ (\src2_terms ^ LSB_CRC),", \
+ \src1, \src2, \dst
+.endm
+
+// Fold \acc into \data and store the result back into \acc. \data can be an
+// unaligned mem operand if using VEX is allowed and the CRC is lsb-first so no
+// byte-reflection is needed; otherwise it must be a vector register. \consts
+// is a vector register containing the needed fold constants, and \tmp is a
+// temporary vector register. All arguments must be the same length.
+.macro _fold_vec acc, data, consts, tmp
+ _pclmulqdq \consts, HI64_TERMS, \acc, HI64_TERMS, \tmp
+ _pclmulqdq \consts, LO64_TERMS, \acc, LO64_TERMS, \acc
+.if AVX_LEVEL <= 2
+ _cond_vex pxor, \data, \tmp, \tmp
+ _cond_vex pxor, \tmp, \acc, \acc
+.else
+ vpternlogq $0x96, \data, \tmp, \acc
+.endif
+.endm
+
+// Fold \acc into \data and store the result back into \acc. \data is an
+// unaligned mem operand, \consts is a vector register containing the needed
+// fold constants, \bswap_mask is a vector register containing the
+// byte-reflection table if the CRC is msb-first, and \tmp1 and \tmp2 are
+// temporary vector registers. All arguments must have length \vl.
+.macro _fold_vec_mem vl, acc, data, consts, bswap_mask, tmp1, tmp2
+.if AVX_LEVEL == 0 || !LSB_CRC
+ _load_data \vl, \data, \bswap_mask, \tmp1
+ _fold_vec \acc, \tmp1, \consts, \tmp2
+.else
+ _fold_vec \acc, \data, \consts, \tmp1
+.endif
+.endm
+
+// Load the constants for folding across 2**i vectors of length VL at a time
+// into all 128-bit lanes of the vector register CONSTS.
+.macro _load_vec_folding_consts i
+ _vbroadcast OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS+(4-LOG2_VL-\i)*16(CONSTS_PTR), \
+ CONSTS
+.endm
+
+// Given vector registers \v0 and \v1 of length \vl, fold \v0 into \v1 and store
+// the result back into \v0. If the remaining length mod \vl is nonzero, also
+// fold \vl data bytes from BUF. For both operations the fold distance is \vl.
+// \consts must be a register of length \vl containing the fold constants.
+.macro _fold_vec_final vl, v0, v1, consts, bswap_mask, tmp1, tmp2
+ _fold_vec \v0, \v1, \consts, \tmp1
+ test $\vl, LEN8
+ jz .Lfold_vec_final_done\@
+ _fold_vec_mem \vl, \v0, (BUF), \consts, \bswap_mask, \tmp1, \tmp2
+ add $\vl, BUF
+.Lfold_vec_final_done\@:
+.endm
+
+// This macro generates the body of a CRC function with the following prototype:
+//
+// crc_t crc_func(crc_t crc, const u8 *buf, size_t len, const void *consts);
+//
+// |crc| is the initial CRC, and crc_t is a data type wide enough to hold it.
+// |buf| is the data to checksum. |len| is the data length in bytes, which must
+// be at least 16. |consts| is a pointer to the fold_across_128_bits_consts
+// field of the constants struct that was generated for the chosen CRC variant.
+//
+// Moving onto the macro parameters, \n is the number of bits in the CRC, e.g.
+// 32 for a CRC-32. Currently the supported values are 8, 16, 32, and 64. If
+// the file is compiled in i386 mode, then the maximum supported value is 32.
+//
+// \lsb_crc is 1 if the CRC processes the least significant bit of each byte
+// first, i.e. maps bit0 to x^7, bit1 to x^6, ..., bit7 to x^0. \lsb_crc is 0
+// if the CRC processes the most significant bit of each byte first, i.e. maps
+// bit0 to x^0, bit1 to x^1, ..., bit7 to x^7.
+//
+// \vl is the maximum length of vector register to use in bytes: 16, 32, or 64.
+//
+// \avx_level is the level of AVX support to use: 0 for SSE only, 2 for AVX2, or
+// 512 for AVX512.
+//
+// If \vl == 16 && \avx_level == 0, the generated code requires:
+// PCLMULQDQ && SSE4.1. (Note: all known CPUs with PCLMULQDQ also have SSE4.1.)
+//
+// If \vl == 32 && \avx_level == 2, the generated code requires:
+// VPCLMULQDQ && AVX2.
+//
+// If \vl == 64 && \avx_level == 512, the generated code requires:
+// VPCLMULQDQ && AVX512BW && AVX512VL.
+//
+// Other \vl and \avx_level combinations are either not supported or not useful.
+.macro _crc_pclmul n, lsb_crc, vl, avx_level
+ .set LSB_CRC, \lsb_crc
+ .set VL, \vl
+ .set AVX_LEVEL, \avx_level
+
+ // Define aliases for the xmm, ymm, or zmm registers according to VL.
+.irp i, 0,1,2,3,4,5,6,7
+ .if VL == 16
+ .set V\i, %xmm\i
+ .set LOG2_VL, 4
+ .elseif VL == 32
+ .set V\i, %ymm\i
+ .set LOG2_VL, 5
+ .elseif VL == 64
+ .set V\i, %zmm\i
+ .set LOG2_VL, 6
+ .else
+ .error "Unsupported vector length"
+ .endif
+.endr
+ // Define aliases for the function parameters.
+ // Note: when crc_t is shorter than u32, zero-extension to 32 bits is
+ // guaranteed by the ABI. Zero-extension to 64 bits is *not* guaranteed
+ // when crc_t is shorter than u64.
+#ifdef __x86_64__
+.if \n <= 32
+ .set CRC, %edi
+.else
+ .set CRC, %rdi
+.endif
+ .set BUF, %rsi
+ .set LEN, %rdx
+ .set LEN32, %edx
+ .set LEN8, %dl
+ .set CONSTS_PTR, %rcx
+#else
+ // 32-bit support, assuming -mregparm=3 and not including support for
+ // CRC-64 (which would use both eax and edx to pass the crc parameter).
+ .set CRC, %eax
+ .set BUF, %edx
+ .set LEN, %ecx
+ .set LEN32, %ecx
+ .set LEN8, %cl
+ .set CONSTS_PTR, %ebx // Passed on stack
+#endif
+
+ // Define aliases for some local variables. V0-V5 are used without
+ // aliases (for accumulators, data, temporary values, etc). Staying
+ // within the first 8 vector registers keeps the code 32-bit SSE
+ // compatible and reduces the size of 64-bit SSE code slightly.
+ .set BSWAP_MASK, V6
+ .set BSWAP_MASK_YMM, %ymm6
+ .set BSWAP_MASK_XMM, %xmm6
+ .set CONSTS, V7
+ .set CONSTS_YMM, %ymm7
+ .set CONSTS_XMM, %xmm7
+
+ // Use ANNOTATE_NOENDBR to suppress an objtool warning, since the
+ // functions generated by this macro are called only by static_call.
+ ANNOTATE_NOENDBR
+
+#ifdef __i386__
+ push CONSTS_PTR
+ mov 8(%esp), CONSTS_PTR
+#endif
+
+ // Create a 128-bit vector that contains the initial CRC in the end
+ // representing the high-order polynomial coefficients, and the rest 0.
+ // If the CRC is msb-first, also load the byte-reflection table.
+.if \n <= 32
+ _cond_vex movd, CRC, %xmm0
+.else
+ _cond_vex movq, CRC, %xmm0
+.endif
+.if !LSB_CRC
+ _cond_vex pslldq, $(128-\n)/8, %xmm0, %xmm0
+ _vbroadcast OFFSETOF_BSWAP_MASK(CONSTS_PTR), BSWAP_MASK
+.endif
+
+ // Load the first vector of data and XOR the initial CRC into the
+ // appropriate end of the first 128-bit lane of data. If LEN < VL, then
+ // use a short vector and jump ahead to the final reduction. (LEN >= 16
+ // is guaranteed here but not necessarily LEN >= VL.)
+.if VL >= 32
+ cmp $VL, LEN
+ jae .Lat_least_1vec\@
+ .if VL == 64
+ cmp $32, LEN32
+ jb .Lless_than_32bytes\@
+ _prepare_v0 32, %ymm0, %ymm1, BSWAP_MASK_YMM
+ add $32, BUF
+ jmp .Lreduce_256bits_to_128bits\@
+.Lless_than_32bytes\@:
+ .endif
+ _prepare_v0 16, %xmm0, %xmm1, BSWAP_MASK_XMM
+ add $16, BUF
+ vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
+ jmp .Lcheck_for_partial_block\@
+.Lat_least_1vec\@:
+.endif
+ _prepare_v0 VL, V0, V1, BSWAP_MASK
+
+ // Handle VL <= LEN < 4*VL.
+ cmp $4*VL-1, LEN
+ ja .Lat_least_4vecs\@
+ add $VL, BUF
+ // If VL <= LEN < 2*VL, then jump ahead to the reduction from 1 vector.
+ // If VL==16 then load fold_across_128_bits_consts first, as the final
+ // reduction depends on it and it won't be loaded anywhere else.
+ cmp $2*VL-1, LEN32
+.if VL == 16
+ _cond_vex movdqa, OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
+.endif
+ jbe .Lreduce_1vec_to_128bits\@
+ // Otherwise 2*VL <= LEN < 4*VL. Load one more vector and jump ahead to
+ // the reduction from 2 vectors.
+ _load_data VL, (BUF), BSWAP_MASK, V1
+ add $VL, BUF
+ jmp .Lreduce_2vecs_to_1\@
+
+.Lat_least_4vecs\@:
+ // Load 3 more vectors of data.
+ _load_data VL, 1*VL(BUF), BSWAP_MASK, V1
+ _load_data VL, 2*VL(BUF), BSWAP_MASK, V2
+ _load_data VL, 3*VL(BUF), BSWAP_MASK, V3
+ sub $-4*VL, BUF // Shorter than 'add 4*VL' when VL=32
+ add $-4*VL, LEN // Shorter than 'sub 4*VL' when VL=32
+
+ // Main loop: while LEN >= 4*VL, fold the 4 vectors V0-V3 into the next
+ // 4 vectors of data and write the result back to V0-V3.
+ cmp $4*VL-1, LEN // Shorter than 'cmp 4*VL' when VL=32
+ jbe .Lreduce_4vecs_to_2\@
+ _load_vec_folding_consts 2
+.Lfold_4vecs_loop\@:
+ _fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
+ _fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
+ _fold_vec_mem VL, V2, 2*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
+ _fold_vec_mem VL, V3, 3*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
+ sub $-4*VL, BUF
+ add $-4*VL, LEN
+ cmp $4*VL-1, LEN
+ ja .Lfold_4vecs_loop\@
+
+ // Fold V0,V1 into V2,V3 and write the result back to V0,V1. Then fold
+ // two more vectors of data from BUF, if at least that much remains.
+.Lreduce_4vecs_to_2\@:
+ _load_vec_folding_consts 1
+ _fold_vec V0, V2, CONSTS, V4
+ _fold_vec V1, V3, CONSTS, V4
+ test $2*VL, LEN8
+ jz .Lreduce_2vecs_to_1\@
+ _fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
+ _fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
+ sub $-2*VL, BUF
+
+ // Fold V0 into V1 and write the result back to V0. Then fold one more
+ // vector of data from BUF, if at least that much remains.
+.Lreduce_2vecs_to_1\@:
+ _load_vec_folding_consts 0
+ _fold_vec_final VL, V0, V1, CONSTS, BSWAP_MASK, V4, V5
+
+.Lreduce_1vec_to_128bits\@:
+.if VL == 64
+ // Reduce 512-bit %zmm0 to 256-bit %ymm0. Then fold 256 more bits of
+ // data from BUF, if at least that much remains.
+ vbroadcasti128 OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS(CONSTS_PTR), CONSTS_YMM
+ vextracti64x4 $1, %zmm0, %ymm1
+ _fold_vec_final 32, %ymm0, %ymm1, CONSTS_YMM, BSWAP_MASK_YMM, %ymm4, %ymm5
+.Lreduce_256bits_to_128bits\@:
+.endif
+.if VL >= 32
+ // Reduce 256-bit %ymm0 to 128-bit %xmm0. Then fold 128 more bits of
+ // data from BUF, if at least that much remains.
+ vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
+ vextracti128 $1, %ymm0, %xmm1
+ _fold_vec_final 16, %xmm0, %xmm1, CONSTS_XMM, BSWAP_MASK_XMM, %xmm4, %xmm5
+.Lcheck_for_partial_block\@:
+.endif
+ and $15, LEN32
+ jz .Lreduce_128bits_to_crc\@
+
+ // 1 <= LEN <= 15 data bytes remain in BUF. The polynomial is now
+ // A*(x^(8*LEN)) + B, where A is the 128-bit polynomial stored in %xmm0
+ // and B is the polynomial of the remaining LEN data bytes. To reduce
+ // this to 128 bits without needing fold constants for each possible
+ // LEN, rearrange this expression into C1*(x^128) + C2, where
+ // C1 = floor(A / x^(128 - 8*LEN)) and C2 = A*x^(8*LEN) + B mod x^128.
+ // Then fold C1 into C2, which is just another fold across 128 bits.
+
+.if !LSB_CRC || AVX_LEVEL == 0
+ // Load the last 16 data bytes. Note that originally LEN was >= 16.
+ _load_data 16, "-16(BUF,LEN)", BSWAP_MASK_XMM, %xmm2
+.endif // Else will use vpblendvb mem operand later.
+.if !LSB_CRC
+ neg LEN // Needed for indexing shuf_table
+.endif
+
+ // tmp = A*x^(8*LEN) mod x^128
+ // lsb: pshufb by [LEN, LEN+1, ..., 15, -1, -1, ..., -1]
+ // i.e. right-shift by LEN bytes.
+ // msb: pshufb by [-1, -1, ..., -1, 0, 1, ..., 15-LEN]
+ // i.e. left-shift by LEN bytes.
+ _cond_vex movdqu, "OFFSETOF_SHUF_TABLE+16(CONSTS_PTR,LEN)", %xmm3
+ _cond_vex pshufb, %xmm3, %xmm0, %xmm1
+
+ // C1 = floor(A / x^(128 - 8*LEN))
+ // lsb: pshufb by [-1, -1, ..., -1, 0, 1, ..., LEN-1]
+ // i.e. left-shift by 16-LEN bytes.
+ // msb: pshufb by [16-LEN, 16-LEN+1, ..., 15, -1, -1, ..., -1]
+ // i.e. right-shift by 16-LEN bytes.
+ _cond_vex pshufb, "OFFSETOF_SHUF_TABLE+32*!LSB_CRC(CONSTS_PTR,LEN)", \
+ %xmm0, %xmm0, unaligned_mem_tmp=%xmm4
+
+ // C2 = tmp + B. This is just a blend of tmp with the last 16 data
+ // bytes (reflected if msb-first). The blend mask is the shuffle table
+	// that was used to create tmp.  0 selects tmp, and 1 selects the last
+	// 16 data bytes.
+.if AVX_LEVEL == 0
+ movdqa %xmm0, %xmm4
+ movdqa %xmm3, %xmm0
+ pblendvb %xmm2, %xmm1 // uses %xmm0 as implicit operand
+ movdqa %xmm4, %xmm0
+.elseif LSB_CRC
+ vpblendvb %xmm3, -16(BUF,LEN), %xmm1, %xmm1
+.else
+ vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
+.endif
+
+ // Fold C1 into C2 and store the 128-bit result in %xmm0.
+ _fold_vec %xmm0, %xmm1, CONSTS_XMM, %xmm4
+
+.Lreduce_128bits_to_crc\@:
+ // Compute the CRC as %xmm0 * x^n mod G. Here %xmm0 means the 128-bit
+ // polynomial stored in %xmm0 (using either lsb-first or msb-first bit
+ // order according to LSB_CRC), and G is the CRC's generator polynomial.
+
+ // First, multiply %xmm0 by x^n and reduce the result to 64+n bits:
+ //
+ // t0 := (x^(64+n) mod G) * floor(%xmm0 / x^64) +
+ // x^n * (%xmm0 mod x^64)
+ //
+ // Store t0 * x^(64-n) in %xmm0. I.e., actually do:
+ //
+ // %xmm0 := ((x^(64+n) mod G) * x^(64-n)) * floor(%xmm0 / x^64) +
+ // x^64 * (%xmm0 mod x^64)
+ //
+ // The extra unreduced factor of x^(64-n) makes floor(t0 / x^n) aligned
+ // to the HI64_TERMS of %xmm0 so that the next pclmulqdq can easily
+ // select it. The 64-bit constant (x^(64+n) mod G) * x^(64-n) in the
+ // msb-first case, or (x^(63+n) mod G) * x^(64-n) in the lsb-first case
+ // (considering the extra factor of x that gets implicitly introduced by
+ // each pclmulqdq when using lsb-first order), is identical to the
+ // constant that was used earlier for folding the LO64_TERMS across 128
+ // bits. Thus it's already available in LO64_TERMS of CONSTS_XMM.
+ _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm0, HI64_TERMS, %xmm1
+.if LSB_CRC
+ _cond_vex psrldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64)
+.else
+ _cond_vex pslldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64)
+.endif
+ _cond_vex pxor, %xmm1, %xmm0, %xmm0
+ // The HI64_TERMS of %xmm0 now contain floor(t0 / x^n).
+ // The LO64_TERMS of %xmm0 now contain (t0 mod x^n) * x^(64-n).
+
+ // First step of Barrett reduction: Compute floor(t0 / G). This is the
+ // polynomial by which G needs to be multiplied to cancel out the x^n
+ // and higher terms of t0, i.e. to reduce t0 mod G. First do:
+ //
+ // t1 := floor(x^(63+n) / G) * x * floor(t0 / x^n)
+ //
+ // Then the desired value floor(t0 / G) is floor(t1 / x^64). The 63 in
+ // x^(63+n) is the maximum degree of floor(t0 / x^n) and thus the lowest
+ // value that makes enough precision be carried through the calculation.
+ //
+ // The '* x' makes it so the result is floor(t1 / x^64) rather than
+ // floor(t1 / x^63), making it qword-aligned in HI64_TERMS so that it
+ // can be extracted much more easily in the next step. In the lsb-first
+ // case the '* x' happens implicitly. In the msb-first case it must be
+ // done explicitly; floor(x^(63+n) / G) * x is a 65-bit constant, so the
+ // constant passed to pclmulqdq is (floor(x^(63+n) / G) * x) - x^64, and
+ // the multiplication by the x^64 term is handled using a pxor. The
+ // pxor causes the low 64 terms of t1 to be wrong, but they are unused.
+ _cond_vex movdqa, OFFSETOF_BARRETT_REDUCTION_CONSTS(CONSTS_PTR), CONSTS_XMM
+ _pclmulqdq CONSTS_XMM, HI64_TERMS, %xmm0, HI64_TERMS, %xmm1
+.if !LSB_CRC
+ _cond_vex pxor, %xmm0, %xmm1, %xmm1 // += x^64 * floor(t0 / x^n)
+.endif
+ // The HI64_TERMS of %xmm1 now contain floor(t1 / x^64) = floor(t0 / G).
+
+ // Second step of Barrett reduction: Cancel out the x^n and higher terms
+ // of t0 by subtracting the needed multiple of G. This gives the CRC:
+ //
+ // crc := t0 - (G * floor(t0 / G))
+ //
+ // But %xmm0 contains t0 * x^(64-n), so it's more convenient to do:
+ //
+ // crc := ((t0 * x^(64-n)) - ((G * x^(64-n)) * floor(t0 / G))) / x^(64-n)
+ //
+ // Furthermore, since the resulting CRC is n-bit, if mod x^n is
+ // explicitly applied to it then the x^n term of G makes no difference
+ // in the result and can be omitted. This helps keep the constant
+ // multiplier in 64 bits in most cases. This gives the following:
+ //
+ // %xmm0 := %xmm0 - (((G - x^n) * x^(64-n)) * floor(t0 / G))
+ // crc := (%xmm0 / x^(64-n)) mod x^n
+ //
+ // In the lsb-first case, each pclmulqdq implicitly introduces
+ // an extra factor of x, so in that case the constant that needs to be
+ // passed to pclmulqdq is actually '(G - x^n) * x^(63-n)' when n <= 63.
+ // For lsb-first CRCs where n=64, the extra factor of x cannot be as
+ // easily avoided. In that case, instead pass '(G - x^n - x^0) / x' to
+ // pclmulqdq and handle the x^0 term (i.e. 1) separately. (All CRC
+ // polynomials have nonzero x^n and x^0 terms.) It works out as: the
+	// CRC has to be XORed with the physically low qword of %xmm1, representing
+ // floor(t0 / G). The most efficient way to do that is to move it to
+ // the physically high qword and use a ternlog to combine the two XORs.
+.if LSB_CRC && \n == 64
+ _cond_vex punpcklqdq, %xmm1, %xmm2, %xmm2
+ _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1
+ .if AVX_LEVEL <= 2
+ _cond_vex pxor, %xmm2, %xmm0, %xmm0
+ _cond_vex pxor, %xmm1, %xmm0, %xmm0
+ .else
+ vpternlogq $0x96, %xmm2, %xmm1, %xmm0
+ .endif
+ _cond_vex "pextrq $1,", %xmm0, %rax // (%xmm0 / x^0) mod x^64
+.else
+ _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1
+ _cond_vex pxor, %xmm1, %xmm0, %xmm0
+ .if \n == 8
+ _cond_vex "pextrb $7 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^56) mod x^8
+ .elseif \n == 16
+ _cond_vex "pextrw $3 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^48) mod x^16
+ .elseif \n == 32
+ _cond_vex "pextrd $1 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^32) mod x^32
+ .else // \n == 64 && !LSB_CRC
+ _cond_vex movq, %xmm0, %rax // (%xmm0 / x^0) mod x^64
+ .endif
+.endif
+
+.if VL > 16
+ vzeroupper // Needed when ymm or zmm registers may have been used.
+.endif
+#ifdef __i386__
+ pop CONSTS_PTR
+#endif
+ RET
+.endm
+
+#ifdef CONFIG_AS_VPCLMULQDQ
+#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb) \
+SYM_FUNC_START(prefix##_pclmul_sse); \
+ _crc_pclmul n=bits, lsb_crc=lsb, vl=16, avx_level=0; \
+SYM_FUNC_END(prefix##_pclmul_sse); \
+ \
+SYM_FUNC_START(prefix##_vpclmul_avx2); \
+ _crc_pclmul n=bits, lsb_crc=lsb, vl=32, avx_level=2; \
+SYM_FUNC_END(prefix##_vpclmul_avx2); \
+ \
+SYM_FUNC_START(prefix##_vpclmul_avx512); \
+ _crc_pclmul n=bits, lsb_crc=lsb, vl=64, avx_level=512; \
+SYM_FUNC_END(prefix##_vpclmul_avx512);
+#else
+#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb) \
+SYM_FUNC_START(prefix##_pclmul_sse); \
+ _crc_pclmul n=bits, lsb_crc=lsb, vl=16, avx_level=0; \
+SYM_FUNC_END(prefix##_pclmul_sse);
+#endif // !CONFIG_AS_VPCLMULQDQ
diff --git a/arch/x86/lib/crc-pclmul-template.h b/arch/x86/lib/crc-pclmul-template.h
new file mode 100644
index 000000000000..c5b3bfe11d8d
--- /dev/null
+++ b/arch/x86/lib/crc-pclmul-template.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Macros for accessing the [V]PCLMULQDQ-based CRC functions that are
+ * instantiated by crc-pclmul-template.S
+ *
+ * Copyright 2025 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+#ifndef _CRC_PCLMUL_TEMPLATE_H
+#define _CRC_PCLMUL_TEMPLATE_H
+
+#include <asm/cpufeatures.h>
+#include <asm/simd.h>
+#include <crypto/internal/simd.h>
+#include <linux/static_call.h>
+#include "crc-pclmul-consts.h"
+
+#define DECLARE_CRC_PCLMUL_FUNCS(prefix, crc_t) \
+crc_t prefix##_pclmul_sse(crc_t crc, const u8 *p, size_t len, \
+ const void *consts_ptr); \
+crc_t prefix##_vpclmul_avx2(crc_t crc, const u8 *p, size_t len, \
+ const void *consts_ptr); \
+crc_t prefix##_vpclmul_avx512(crc_t crc, const u8 *p, size_t len, \
+ const void *consts_ptr); \
+DEFINE_STATIC_CALL(prefix##_pclmul, prefix##_pclmul_sse)
+
+#define INIT_CRC_PCLMUL(prefix) \
+do { \
+ if (IS_ENABLED(CONFIG_AS_VPCLMULQDQ) && \
+ boot_cpu_has(X86_FEATURE_VPCLMULQDQ) && \
+ boot_cpu_has(X86_FEATURE_AVX2) && \
+ cpu_has_xfeatures(XFEATURE_MASK_YMM, NULL)) { \
+ if (boot_cpu_has(X86_FEATURE_AVX512BW) && \
+ boot_cpu_has(X86_FEATURE_AVX512VL) && \
+ !boot_cpu_has(X86_FEATURE_PREFER_YMM) && \
+ cpu_has_xfeatures(XFEATURE_MASK_AVX512, NULL)) { \
+ static_call_update(prefix##_pclmul, \
+ prefix##_vpclmul_avx512); \
+ } else { \
+ static_call_update(prefix##_pclmul, \
+ prefix##_vpclmul_avx2); \
+ } \
+ } \
+} while (0)
+
+/*
+ * Call a [V]PCLMULQDQ optimized CRC function if the data length is at least 16
+ * bytes, the CPU has PCLMULQDQ support, and the current context may use SIMD.
+ *
+ * 16 bytes is the minimum length supported by the [V]PCLMULQDQ functions.
+ * There is overhead associated with kernel_fpu_begin() and kernel_fpu_end(),
+ * varying by CPU and factors such as which parts of the "FPU" state userspace
+ * has touched, which could result in a larger cutoff being better. Indeed, a
+ * larger cutoff is usually better for a *single* message. However, the
+ * overhead of the FPU section gets amortized if multiple FPU sections get
+ * executed before returning to userspace, since the XSAVE and XRSTOR occur only
+ * once. Considering that and the fact that the [V]PCLMULQDQ code is lighter on
+ * the dcache than the table-based code is, a 16-byte cutoff seems to work well.
+ */
+#define CRC_PCLMUL(crc, p, len, prefix, consts, have_pclmulqdq) \
+do { \
+ if ((len) >= 16 && static_branch_likely(&(have_pclmulqdq)) && \
+ crypto_simd_usable()) { \
+ const void *consts_ptr; \
+ \
+ consts_ptr = (consts).fold_across_128_bits_consts; \
+ kernel_fpu_begin(); \
+ crc = static_call(prefix##_pclmul)((crc), (p), (len), \
+ consts_ptr); \
+ kernel_fpu_end(); \
+ return crc; \
+ } \
+} while (0)
+
+#endif /* _CRC_PCLMUL_TEMPLATE_H */
diff --git a/arch/x86/lib/crc-t10dif-glue.c b/arch/x86/lib/crc-t10dif-glue.c
new file mode 100644
index 000000000000..f89c335cde3c
--- /dev/null
+++ b/arch/x86/lib/crc-t10dif-glue.c
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * CRC-T10DIF using [V]PCLMULQDQ instructions
+ *
+ * Copyright 2024 Google LLC
+ */
+
+#include <linux/crc-t10dif.h>
+#include <linux/module.h>
+#include "crc-pclmul-template.h"
+
+static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
+
+DECLARE_CRC_PCLMUL_FUNCS(crc16_msb, u16);
+
+u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len)
+{
+ CRC_PCLMUL(crc, p, len, crc16_msb, crc16_msb_0x8bb7_consts,
+ have_pclmulqdq);
+ return crc_t10dif_generic(crc, p, len);
+}
+EXPORT_SYMBOL(crc_t10dif_arch);
+
+static int __init crc_t10dif_x86_init(void)
+{
+ if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
+ static_branch_enable(&have_pclmulqdq);
+ INIT_CRC_PCLMUL(crc16_msb);
+ }
+ return 0;
+}
+arch_initcall(crc_t10dif_x86_init);
+
+static void __exit crc_t10dif_x86_exit(void)
+{
+}
+module_exit(crc_t10dif_x86_exit);
+
+MODULE_DESCRIPTION("CRC-T10DIF using [V]PCLMULQDQ instructions");
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/lib/crc16-msb-pclmul.S b/arch/x86/lib/crc16-msb-pclmul.S
new file mode 100644
index 000000000000..e9fe248093a8
--- /dev/null
+++ b/arch/x86/lib/crc16-msb-pclmul.S
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+// Copyright 2025 Google LLC
+
+#include "crc-pclmul-template.S"
+
+DEFINE_CRC_PCLMUL_FUNCS(crc16_msb, /* bits= */ 16, /* lsb= */ 0)
diff --git a/arch/x86/lib/crc32-glue.c b/arch/x86/lib/crc32-glue.c
new file mode 100644
index 000000000000..e3f93b17ac3f
--- /dev/null
+++ b/arch/x86/lib/crc32-glue.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * x86-optimized CRC32 functions
+ *
+ * Copyright (C) 2008 Intel Corporation
+ * Copyright 2012 Xyratex Technology Limited
+ * Copyright 2024 Google LLC
+ */
+
+#include <linux/crc32.h>
+#include <linux/module.h>
+#include "crc-pclmul-template.h"
+
+static DEFINE_STATIC_KEY_FALSE(have_crc32);
+static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
+
+DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32);
+
+u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
+{
+ CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts,
+ have_pclmulqdq);
+ return crc32_le_base(crc, p, len);
+}
+EXPORT_SYMBOL(crc32_le_arch);
+
+#ifdef CONFIG_X86_64
+#define CRC32_INST "crc32q %1, %q0"
+#else
+#define CRC32_INST "crc32l %1, %0"
+#endif
+
+/*
+ * Use carryless multiply version of crc32c when buffer size is >= 512 to
+ * account for FPU state save/restore overhead.
+ */
+#define CRC32C_PCLMUL_BREAKEVEN 512
+
+asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
+
+u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
+{
+ size_t num_longs;
+
+ if (!static_branch_likely(&have_crc32))
+ return crc32c_base(crc, p, len);
+
+ if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN &&
+ static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) {
+ kernel_fpu_begin();
+ crc = crc32c_x86_3way(crc, p, len);
+ kernel_fpu_end();
+ return crc;
+ }
+
+ for (num_longs = len / sizeof(unsigned long);
+ num_longs != 0; num_longs--, p += sizeof(unsigned long))
+ asm(CRC32_INST : "+r" (crc) : ASM_INPUT_RM (*(unsigned long *)p));
+
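+	/*
+	 * sizeof(long) == 8 here: handle one remaining 4-byte chunk.  On
+	 * 32-bit the loop above already consumed every 4-byte chunk, so this
+	 * branch is compiled out.
+	 */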
+ if (sizeof(unsigned long) > 4 && (len & 4)) {
+ asm("crc32l %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u32 *)p));
+ p += 4;
+ }
+ if (len & 2) {
+ asm("crc32w %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u16 *)p));
+ p += 2;
+ }
+ if (len & 1)
+ asm("crc32b %1, %0" : "+r" (crc) : ASM_INPUT_RM (*p));
+
+ return crc;
+}
+EXPORT_SYMBOL(crc32c_arch);
+
+u32 crc32_be_arch(u32 crc, const u8 *p, size_t len)
+{
+ return crc32_be_base(crc, p, len);
+}
+EXPORT_SYMBOL(crc32_be_arch);
+
+static int __init crc32_x86_init(void)
+{
+ if (boot_cpu_has(X86_FEATURE_XMM4_2))
+ static_branch_enable(&have_crc32);
+ if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
+ static_branch_enable(&have_pclmulqdq);
+ INIT_CRC_PCLMUL(crc32_lsb);
+ }
+ return 0;
+}
+arch_initcall(crc32_x86_init);
+
+static void __exit crc32_x86_exit(void)
+{
+}
+module_exit(crc32_x86_exit);
+
+u32 crc32_optimizations(void)
+{
+ u32 optimizations = 0;
+
+ if (static_key_enabled(&have_crc32))
+ optimizations |= CRC32C_OPTIMIZATION;
+ if (static_key_enabled(&have_pclmulqdq))
+ optimizations |= CRC32_LE_OPTIMIZATION;
+ return optimizations;
+}
+EXPORT_SYMBOL(crc32_optimizations);
+
+MODULE_DESCRIPTION("x86-optimized CRC32 functions");
+MODULE_LICENSE("GPL");
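
For reference, the per-word loop in crc32c_arch() above corresponds directly to the SSE4.2 CRC32 intrinsics. A minimal userspace sketch (illustration only, not part of this patch; assumes a compiler invoked with -msse4.2 or equivalent):

	#include <nmmintrin.h>	/* _mm_crc32_* (SSE4.2) */
	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	static uint32_t crc32c_words(uint32_t crc, const uint8_t *p, size_t len)
	{
		/* Eight bytes at a time, like the crc32q loop in crc32c_arch(). */
		while (len >= 8) {
			uint64_t v;

			memcpy(&v, p, sizeof(v));
			crc = (uint32_t)_mm_crc32_u64(crc, v);
			p += 8;
			len -= 8;
		}
		/* Byte-at-a-time tail (the kernel code uses 4/2/1-byte steps). */
		while (len--)
			crc = _mm_crc32_u8(crc, *p++);
		return crc;
	}
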
diff --git a/arch/x86/lib/crc32-pclmul.S b/arch/x86/lib/crc32-pclmul.S
new file mode 100644
index 000000000000..f20f40fb0172
--- /dev/null
+++ b/arch/x86/lib/crc32-pclmul.S
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+// Copyright 2025 Google LLC
+
+#include "crc-pclmul-template.S"
+
+DEFINE_CRC_PCLMUL_FUNCS(crc32_lsb, /* bits= */ 32, /* lsb= */ 1)
diff --git a/arch/x86/lib/crc32c-3way.S b/arch/x86/lib/crc32c-3way.S
new file mode 100644
index 000000000000..9b8770503bbc
--- /dev/null
+++ b/arch/x86/lib/crc32c-3way.S
@@ -0,0 +1,360 @@
+/*
+ * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64)
+ *
+ * The white papers on CRC32C calculations with PCLMULQDQ instruction can be
+ * downloaded from:
+ * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
+ * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
+ *
+ * Copyright (C) 2012 Intel Corporation.
+ * Copyright 2024 Google LLC
+ *
+ * Authors:
+ * Wajdi Feghali <wajdi.k.feghali@intel.com>
+ * James Guilford <james.guilford@intel.com>
+ * David Cote <david.m.cote@intel.com>
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/linkage.h>
+
+## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
+
+# Define threshold below which buffers are considered "small" and routed to
+# regular CRC code that does not interleave the CRC instructions.
+#define SMALL_SIZE 200
+
+# u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
+
+.text
+SYM_FUNC_START(crc32c_x86_3way)
+#define crc0 %edi
+#define crc0_q %rdi
+#define bufp %rsi
+#define bufp_d %esi
+#define len %rdx
+#define len_dw %edx
+#define n_misaligned %ecx /* overlaps chunk_bytes! */
+#define n_misaligned_q %rcx
+#define chunk_bytes %ecx /* overlaps n_misaligned! */
+#define chunk_bytes_q %rcx
+#define crc1 %r8
+#define crc2 %r9
+
+ cmp $SMALL_SIZE, len
+ jb .Lsmall
+
+ ################################################################
+ ## 1) ALIGN:
+ ################################################################
+ mov bufp_d, n_misaligned
+ neg n_misaligned
+ and $7, n_misaligned # calculate the misalignment amount of
+ # the address
+ je .Laligned # Skip if aligned
+
+ # Process 1 <= n_misaligned <= 7 bytes individually in order to align
+ # the remaining data to an 8-byte boundary.
+.Ldo_align:
+ movq (bufp), %rax
+ add n_misaligned_q, bufp
+ sub n_misaligned_q, len
+.Lalign_loop:
+ crc32b %al, crc0 # compute crc32 of 1-byte
+ shr $8, %rax # get next byte
+ dec n_misaligned
+ jne .Lalign_loop
+.Laligned:
+
+ ################################################################
+ ## 2) PROCESS BLOCK:
+ ################################################################
+
+ cmp $128*24, len
+ jae .Lfull_block
+
+.Lpartial_block:
+ # Compute floor(len / 24) to get num qwords to process from each lane.
+ imul $2731, len_dw, %eax # 2731 = ceil(2^16 / 24)
+ shr $16, %eax
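+	# This approximation is exact here: with len = 24*q + r (r <= 23) and
+	# len < 128*24, 2731*24 = 2^16 + 8 gives len*2731 = (q << 16) + 8*q +
+	# 2731*r, and the error term 8*q + 2731*r stays below 2^16, so the
+	# shift above yields exactly floor(len / 24).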
+ jmp .Lcrc_3lanes
+
+.Lfull_block:
+ # Processing 128 qwords from each lane.
+ mov $128, %eax
+
+ ################################################################
+ ## 3) CRC each of three lanes:
+ ################################################################
+
+.Lcrc_3lanes:
+ xor crc1,crc1
+ xor crc2,crc2
+ mov %eax, chunk_bytes
+ shl $3, chunk_bytes # num bytes to process from each lane
+ sub $5, %eax # 4 for 4x_loop, 1 for special last iter
+ jl .Lcrc_3lanes_4x_done
+
+ # Unroll the loop by a factor of 4 to reduce the overhead of the loop
+ # bookkeeping instructions, which can compete with crc32q for the ALUs.
+.Lcrc_3lanes_4x_loop:
+ crc32q (bufp), crc0_q
+ crc32q (bufp,chunk_bytes_q), crc1
+ crc32q (bufp,chunk_bytes_q,2), crc2
+ crc32q 8(bufp), crc0_q
+ crc32q 8(bufp,chunk_bytes_q), crc1
+ crc32q 8(bufp,chunk_bytes_q,2), crc2
+ crc32q 16(bufp), crc0_q
+ crc32q 16(bufp,chunk_bytes_q), crc1
+ crc32q 16(bufp,chunk_bytes_q,2), crc2
+ crc32q 24(bufp), crc0_q
+ crc32q 24(bufp,chunk_bytes_q), crc1
+ crc32q 24(bufp,chunk_bytes_q,2), crc2
+ add $32, bufp
+ sub $4, %eax
+ jge .Lcrc_3lanes_4x_loop
+
+.Lcrc_3lanes_4x_done:
+ add $4, %eax
+ jz .Lcrc_3lanes_last_qword
+
+.Lcrc_3lanes_1x_loop:
+ crc32q (bufp), crc0_q
+ crc32q (bufp,chunk_bytes_q), crc1
+ crc32q (bufp,chunk_bytes_q,2), crc2
+ add $8, bufp
+ dec %eax
+ jnz .Lcrc_3lanes_1x_loop
+
+.Lcrc_3lanes_last_qword:
+ crc32q (bufp), crc0_q
+ crc32q (bufp,chunk_bytes_q), crc1
+# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet
+
+ ################################################################
+ ## 4) Combine three results:
+ ################################################################
+
+ lea (K_table-8)(%rip), %rax # first entry is for idx 1
+ pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2
+ lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3
+ sub %rax, len # len -= chunk_bytes * 3
+
+ movq crc0_q, %xmm1 # CRC for block 1
+ pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2
+
+ movq crc1, %xmm2 # CRC for block 2
+ pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1
+
+ pxor %xmm2,%xmm1
+ movq %xmm1, %rax
+ xor (bufp,chunk_bytes_q,2), %rax
+ mov crc2, crc0_q
+ crc32 %rax, crc0_q
+ lea 8(bufp,chunk_bytes_q,2), bufp
+
+ ################################################################
+ ## 5) If more blocks remain, goto (2):
+ ################################################################
+
+ cmp $128*24, len
+ jae .Lfull_block
+ cmp $SMALL_SIZE, len
+ jae .Lpartial_block
+
+ #######################################################################
+ ## 6) Process any remainder without interleaving:
+ #######################################################################
+.Lsmall:
+ test len_dw, len_dw
+ jz .Ldone
+ mov len_dw, %eax
+ shr $3, %eax
+ jz .Ldo_dword
+.Ldo_qwords:
+ crc32q (bufp), crc0_q
+ add $8, bufp
+ dec %eax
+ jnz .Ldo_qwords
+.Ldo_dword:
+ test $4, len_dw
+ jz .Ldo_word
+ crc32l (bufp), crc0
+ add $4, bufp
+.Ldo_word:
+ test $2, len_dw
+ jz .Ldo_byte
+ crc32w (bufp), crc0
+ add $2, bufp
+.Ldo_byte:
+ test $1, len_dw
+ jz .Ldone
+ crc32b (bufp), crc0
+.Ldone:
+ mov crc0, %eax
+ RET
+SYM_FUNC_END(crc32c_x86_3way)
+
+.section .rodata, "a", @progbits
+ ################################################################
+ ## PCLMULQDQ tables
+	## Table is 128 entries x 2 32-bit words (8 bytes) each
+ ################################################################
+.align 8
+K_table:
+ .long 0x493c7d27, 0x00000001
+ .long 0xba4fc28e, 0x493c7d27
+ .long 0xddc0152b, 0xf20c0dfe
+ .long 0x9e4addf8, 0xba4fc28e
+ .long 0x39d3b296, 0x3da6d0cb
+ .long 0x0715ce53, 0xddc0152b
+ .long 0x47db8317, 0x1c291d04
+ .long 0x0d3b6092, 0x9e4addf8
+ .long 0xc96cfdc0, 0x740eef02
+ .long 0x878a92a7, 0x39d3b296
+ .long 0xdaece73e, 0x083a6eec
+ .long 0xab7aff2a, 0x0715ce53
+ .long 0x2162d385, 0xc49f4f67
+ .long 0x83348832, 0x47db8317
+ .long 0x299847d5, 0x2ad91c30
+ .long 0xb9e02b86, 0x0d3b6092
+ .long 0x18b33a4e, 0x6992cea2
+ .long 0xb6dd949b, 0xc96cfdc0
+ .long 0x78d9ccb7, 0x7e908048
+ .long 0xbac2fd7b, 0x878a92a7
+ .long 0xa60ce07b, 0x1b3d8f29
+ .long 0xce7f39f4, 0xdaece73e
+ .long 0x61d82e56, 0xf1d0f55e
+ .long 0xd270f1a2, 0xab7aff2a
+ .long 0xc619809d, 0xa87ab8a8
+ .long 0x2b3cac5d, 0x2162d385
+ .long 0x65863b64, 0x8462d800
+ .long 0x1b03397f, 0x83348832
+ .long 0xebb883bd, 0x71d111a8
+ .long 0xb3e32c28, 0x299847d5
+ .long 0x064f7f26, 0xffd852c6
+ .long 0xdd7e3b0c, 0xb9e02b86
+ .long 0xf285651c, 0xdcb17aa4
+ .long 0x10746f3c, 0x18b33a4e
+ .long 0xc7a68855, 0xf37c5aee
+ .long 0x271d9844, 0xb6dd949b
+ .long 0x8e766a0c, 0x6051d5a2
+ .long 0x93a5f730, 0x78d9ccb7
+ .long 0x6cb08e5c, 0x18b0d4ff
+ .long 0x6b749fb2, 0xbac2fd7b
+ .long 0x1393e203, 0x21f3d99c
+ .long 0xcec3662e, 0xa60ce07b
+ .long 0x96c515bb, 0x8f158014
+ .long 0xe6fc4e6a, 0xce7f39f4
+ .long 0x8227bb8a, 0xa00457f7
+ .long 0xb0cd4768, 0x61d82e56
+ .long 0x39c7ff35, 0x8d6d2c43
+ .long 0xd7a4825c, 0xd270f1a2
+ .long 0x0ab3844b, 0x00ac29cf
+ .long 0x0167d312, 0xc619809d
+ .long 0xf6076544, 0xe9adf796
+ .long 0x26f6a60a, 0x2b3cac5d
+ .long 0xa741c1bf, 0x96638b34
+ .long 0x98d8d9cb, 0x65863b64
+ .long 0x49c3cc9c, 0xe0e9f351
+ .long 0x68bce87a, 0x1b03397f
+ .long 0x57a3d037, 0x9af01f2d
+ .long 0x6956fc3b, 0xebb883bd
+ .long 0x42d98888, 0x2cff42cf
+ .long 0x3771e98f, 0xb3e32c28
+ .long 0xb42ae3d9, 0x88f25a3a
+ .long 0x2178513a, 0x064f7f26
+ .long 0xe0ac139e, 0x4e36f0b0
+ .long 0x170076fa, 0xdd7e3b0c
+ .long 0x444dd413, 0xbd6f81f8
+ .long 0x6f345e45, 0xf285651c
+ .long 0x41d17b64, 0x91c9bd4b
+ .long 0xff0dba97, 0x10746f3c
+ .long 0xa2b73df1, 0x885f087b
+ .long 0xf872e54c, 0xc7a68855
+ .long 0x1e41e9fc, 0x4c144932
+ .long 0x86d8e4d2, 0x271d9844
+ .long 0x651bd98b, 0x52148f02
+ .long 0x5bb8f1bc, 0x8e766a0c
+ .long 0xa90fd27a, 0xa3c6f37a
+ .long 0xb3af077a, 0x93a5f730
+ .long 0x4984d782, 0xd7c0557f
+ .long 0xca6ef3ac, 0x6cb08e5c
+ .long 0x234e0b26, 0x63ded06a
+ .long 0xdd66cbbb, 0x6b749fb2
+ .long 0x4597456a, 0x4d56973c
+ .long 0xe9e28eb4, 0x1393e203
+ .long 0x7b3ff57a, 0x9669c9df
+ .long 0xc9c8b782, 0xcec3662e
+ .long 0x3f70cc6f, 0xe417f38a
+ .long 0x93e106a4, 0x96c515bb
+ .long 0x62ec6c6d, 0x4b9e0f71
+ .long 0xd813b325, 0xe6fc4e6a
+ .long 0x0df04680, 0xd104b8fc
+ .long 0x2342001e, 0x8227bb8a
+ .long 0x0a2a8d7e, 0x5b397730
+ .long 0x6d9a4957, 0xb0cd4768
+ .long 0xe8b6368b, 0xe78eb416
+ .long 0xd2c3ed1a, 0x39c7ff35
+ .long 0x995a5724, 0x61ff0e01
+ .long 0x9ef68d35, 0xd7a4825c
+ .long 0x0c139b31, 0x8d96551c
+ .long 0xf2271e60, 0x0ab3844b
+ .long 0x0b0bf8ca, 0x0bf80dd2
+ .long 0x2664fd8b, 0x0167d312
+ .long 0xed64812d, 0x8821abed
+ .long 0x02ee03b2, 0xf6076544
+ .long 0x8604ae0f, 0x6a45d2b2
+ .long 0x363bd6b3, 0x26f6a60a
+ .long 0x135c83fd, 0xd8d26619
+ .long 0x5fabe670, 0xa741c1bf
+ .long 0x35ec3279, 0xde87806c
+ .long 0x00bcf5f6, 0x98d8d9cb
+ .long 0x8ae00689, 0x14338754
+ .long 0x17f27698, 0x49c3cc9c
+ .long 0x58ca5f00, 0x5bd2011f
+ .long 0xaa7c7ad5, 0x68bce87a
+ .long 0xb5cfca28, 0xdd07448e
+ .long 0xded288f8, 0x57a3d037
+ .long 0x59f229bc, 0xdde8f5b9
+ .long 0x6d390dec, 0x6956fc3b
+ .long 0x37170390, 0xa3e3e02c
+ .long 0x6353c1cc, 0x42d98888
+ .long 0xc4584f5c, 0xd73c7bea
+ .long 0xf48642e9, 0x3771e98f
+ .long 0x531377e2, 0x80ff0093
+ .long 0xdd35bc8d, 0xb42ae3d9
+ .long 0xb25b29f2, 0x8fe4c34d
+ .long 0x9a5ede41, 0x2178513a
+ .long 0xa563905d, 0xdf99fc11
+ .long 0x45cddf4e, 0xe0ac139e
+ .long 0xacfa3103, 0x6c23e841
+ .long 0xa51b6135, 0x170076fa
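
The small-buffer path (step 6 above) is plain width-descending folding with the crc32 instruction, no lane interleaving and no pclmulqdq. A minimal user-space sketch of that tail handling, compiled with -msse4.2; the crc32c_tail() name and the memcpy-based unaligned loads are illustrative choices, not taken from the patch:

#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <nmmintrin.h>		/* SSE4.2 _mm_crc32_* intrinsics */

/* Fold the remaining bytes into the running CRC32C value, widest unit first:
 * qwords, then an optional dword, word and byte, mirroring the .Ldo_qwords,
 * .Ldo_dword, .Ldo_word and .Ldo_byte labels above.
 */
static uint32_t crc32c_tail(uint32_t crc, const uint8_t *p, size_t len)
{
	uint64_t c = crc;
	uint64_t q;
	uint32_t d;
	uint16_t w;

	for (; len >= 8; p += 8, len -= 8) {
		memcpy(&q, p, 8);
		c = _mm_crc32_u64(c, q);
	}
	if (len & 4) {
		memcpy(&d, p, 4);
		c = _mm_crc32_u32((uint32_t)c, d);
		p += 4;
	}
	if (len & 2) {
		memcpy(&w, p, 2);
		c = _mm_crc32_u16((uint32_t)c, w);
		p += 2;
	}
	if (len & 1)
		c = _mm_crc32_u8((uint32_t)c, *p);
	return (uint32_t)c;
}

The assembly only drops to this path for buffers under SMALL_SIZE and for whatever remainder is left once the interleaved three-lane passes and the pclmulqdq-based combine are done.
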
diff --git a/arch/x86/lib/crc64-glue.c b/arch/x86/lib/crc64-glue.c
new file mode 100644
index 000000000000..b0e1b719ecbf
--- /dev/null
+++ b/arch/x86/lib/crc64-glue.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * CRC64 using [V]PCLMULQDQ instructions
+ *
+ * Copyright 2025 Google LLC
+ */
+
+#include <linux/crc64.h>
+#include <linux/module.h>
+#include "crc-pclmul-template.h"
+
+static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
+
+DECLARE_CRC_PCLMUL_FUNCS(crc64_msb, u64);
+DECLARE_CRC_PCLMUL_FUNCS(crc64_lsb, u64);
+
+u64 crc64_be_arch(u64 crc, const u8 *p, size_t len)
+{
+ CRC_PCLMUL(crc, p, len, crc64_msb, crc64_msb_0x42f0e1eba9ea3693_consts,
+ have_pclmulqdq);
+ return crc64_be_generic(crc, p, len);
+}
+EXPORT_SYMBOL_GPL(crc64_be_arch);
+
+u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len)
+{
+ CRC_PCLMUL(crc, p, len, crc64_lsb, crc64_lsb_0x9a6c9329ac4bc9b5_consts,
+ have_pclmulqdq);
+ return crc64_nvme_generic(crc, p, len);
+}
+EXPORT_SYMBOL_GPL(crc64_nvme_arch);
+
+static int __init crc64_x86_init(void)
+{
+ if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
+ static_branch_enable(&have_pclmulqdq);
+ INIT_CRC_PCLMUL(crc64_msb);
+ INIT_CRC_PCLMUL(crc64_lsb);
+ }
+ return 0;
+}
+arch_initcall(crc64_x86_init);
+
+static void __exit crc64_x86_exit(void)
+{
+}
+module_exit(crc64_x86_exit);
+
+MODULE_DESCRIPTION("CRC64 using [V]PCLMULQDQ instructions");
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/lib/crc64-pclmul.S b/arch/x86/lib/crc64-pclmul.S
new file mode 100644
index 000000000000..4173051b5197
--- /dev/null
+++ b/arch/x86/lib/crc64-pclmul.S
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+// Copyright 2025 Google LLC
+
+#include "crc-pclmul-template.S"
+
+DEFINE_CRC_PCLMUL_FUNCS(crc64_msb, /* bits= */ 64, /* lsb= */ 0)
+DEFINE_CRC_PCLMUL_FUNCS(crc64_lsb, /* bits= */ 64, /* lsb= */ 1)
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index 23f81ca3f06b..e86eda2c0b04 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -131,7 +131,7 @@ static void delay_halt_mwaitx(u64 unused, u64 cycles)
* Use cpu_tss_rw as a cacheline-aligned, seldom accessed per-cpu
* variable as the monitor target.
*/
- __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0);
+ __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0);
/*
* AMD, like Intel, supports the EAX hint and EAX=0xf means, do not
diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
index 10d5ed8b5990..9d5654b8a72a 100644
--- a/arch/x86/lib/getuser.S
+++ b/arch/x86/lib/getuser.S
@@ -28,20 +28,22 @@
#include <linux/export.h>
#include <linux/linkage.h>
+#include <linux/objtool.h>
#include <asm/page_types.h>
#include <asm/errno.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/asm.h>
#include <asm/smap.h>
+#include <asm/runtime-const.h>
#define ASM_BARRIER_NOSPEC ALTERNATIVE "", "lfence", X86_FEATURE_LFENCE_RDTSC
.macro check_range size:req
.if IS_ENABLED(CONFIG_X86_64)
- mov %rax, %rdx
- sar $63, %rdx
- or %rdx, %rax
+ RUNTIME_CONST_PTR USER_PTR_MAX, rdx
+ cmp %rdx, %rax
+ cmova %rdx, %rax
.else
cmp $TASK_SIZE_MAX-\size+1, %eax
jae .Lbad_get_user
@@ -50,11 +52,18 @@
.endif
.endm
+.macro UACCESS op src dst
+1: \op \src,\dst
+ _ASM_EXTABLE_UA(1b, __get_user_handle_exception)
+.endm
+
+
.text
SYM_FUNC_START(__get_user_1)
+ ANNOTATE_NOENDBR
check_range size=1
ASM_STAC
-1: movzbl (%_ASM_AX),%edx
+ UACCESS movzbl (%_ASM_AX),%edx
xor %eax,%eax
ASM_CLAC
RET
@@ -62,9 +71,10 @@ SYM_FUNC_END(__get_user_1)
EXPORT_SYMBOL(__get_user_1)
SYM_FUNC_START(__get_user_2)
+ ANNOTATE_NOENDBR
check_range size=2
ASM_STAC
-2: movzwl (%_ASM_AX),%edx
+ UACCESS movzwl (%_ASM_AX),%edx
xor %eax,%eax
ASM_CLAC
RET
@@ -72,9 +82,10 @@ SYM_FUNC_END(__get_user_2)
EXPORT_SYMBOL(__get_user_2)
SYM_FUNC_START(__get_user_4)
+ ANNOTATE_NOENDBR
check_range size=4
ASM_STAC
-3: movl (%_ASM_AX),%edx
+ UACCESS movl (%_ASM_AX),%edx
xor %eax,%eax
ASM_CLAC
RET
@@ -82,13 +93,17 @@ SYM_FUNC_END(__get_user_4)
EXPORT_SYMBOL(__get_user_4)
SYM_FUNC_START(__get_user_8)
+ ANNOTATE_NOENDBR
+#ifndef CONFIG_X86_64
+ xor %ecx,%ecx
+#endif
check_range size=8
ASM_STAC
#ifdef CONFIG_X86_64
-4: movq (%_ASM_AX),%rdx
+ UACCESS movq (%_ASM_AX),%rdx
#else
-4: movl (%_ASM_AX),%edx
-5: movl 4(%_ASM_AX),%ecx
+ UACCESS movl (%_ASM_AX),%edx
+ UACCESS movl 4(%_ASM_AX),%ecx
#endif
xor %eax,%eax
ASM_CLAC
@@ -98,9 +113,10 @@ EXPORT_SYMBOL(__get_user_8)
/* .. and the same for __get_user, just without the range checks */
SYM_FUNC_START(__get_user_nocheck_1)
+ ANNOTATE_NOENDBR
ASM_STAC
ASM_BARRIER_NOSPEC
-6: movzbl (%_ASM_AX),%edx
+ UACCESS movzbl (%_ASM_AX),%edx
xor %eax,%eax
ASM_CLAC
RET
@@ -108,9 +124,10 @@ SYM_FUNC_END(__get_user_nocheck_1)
EXPORT_SYMBOL(__get_user_nocheck_1)
SYM_FUNC_START(__get_user_nocheck_2)
+ ANNOTATE_NOENDBR
ASM_STAC
ASM_BARRIER_NOSPEC
-7: movzwl (%_ASM_AX),%edx
+ UACCESS movzwl (%_ASM_AX),%edx
xor %eax,%eax
ASM_CLAC
RET
@@ -118,9 +135,10 @@ SYM_FUNC_END(__get_user_nocheck_2)
EXPORT_SYMBOL(__get_user_nocheck_2)
SYM_FUNC_START(__get_user_nocheck_4)
+ ANNOTATE_NOENDBR
ASM_STAC
ASM_BARRIER_NOSPEC
-8: movl (%_ASM_AX),%edx
+ UACCESS movl (%_ASM_AX),%edx
xor %eax,%eax
ASM_CLAC
RET
@@ -128,13 +146,15 @@ SYM_FUNC_END(__get_user_nocheck_4)
EXPORT_SYMBOL(__get_user_nocheck_4)
SYM_FUNC_START(__get_user_nocheck_8)
+ ANNOTATE_NOENDBR
ASM_STAC
ASM_BARRIER_NOSPEC
#ifdef CONFIG_X86_64
-9: movq (%_ASM_AX),%rdx
+ UACCESS movq (%_ASM_AX),%rdx
#else
-9: movl (%_ASM_AX),%edx
-10: movl 4(%_ASM_AX),%ecx
+ xor %ecx,%ecx
+ UACCESS movl (%_ASM_AX),%edx
+ UACCESS movl 4(%_ASM_AX),%ecx
#endif
xor %eax,%eax
ASM_CLAC
@@ -150,36 +170,3 @@ SYM_CODE_START_LOCAL(__get_user_handle_exception)
mov $(-EFAULT),%_ASM_AX
RET
SYM_CODE_END(__get_user_handle_exception)
-
-#ifdef CONFIG_X86_32
-SYM_CODE_START_LOCAL(__get_user_8_handle_exception)
- ASM_CLAC
-bad_get_user_8:
- xor %edx,%edx
- xor %ecx,%ecx
- mov $(-EFAULT),%_ASM_AX
- RET
-SYM_CODE_END(__get_user_8_handle_exception)
-#endif
-
-/* get_user */
- _ASM_EXTABLE_UA(1b, __get_user_handle_exception)
- _ASM_EXTABLE_UA(2b, __get_user_handle_exception)
- _ASM_EXTABLE_UA(3b, __get_user_handle_exception)
-#ifdef CONFIG_X86_64
- _ASM_EXTABLE_UA(4b, __get_user_handle_exception)
-#else
- _ASM_EXTABLE_UA(4b, __get_user_8_handle_exception)
- _ASM_EXTABLE_UA(5b, __get_user_8_handle_exception)
-#endif
-
-/* __get_user */
- _ASM_EXTABLE_UA(6b, __get_user_handle_exception)
- _ASM_EXTABLE_UA(7b, __get_user_handle_exception)
- _ASM_EXTABLE_UA(8b, __get_user_handle_exception)
-#ifdef CONFIG_X86_64
- _ASM_EXTABLE_UA(9b, __get_user_handle_exception)
-#else
- _ASM_EXTABLE_UA(9b, __get_user_8_handle_exception)
- _ASM_EXTABLE_UA(10b, __get_user_8_handle_exception)
-#endif
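
The rewritten 64-bit check_range no longer builds an all-ones mask from the sign bit; it clamps the incoming pointer to the runtime-patched USER_PTR_MAX constant, so any address outside the user range is replaced by a value that cannot reach kernel memory. A user-space model of just that clamp, with user_ptr_max standing in for the runtime constant:

#include <stdint.h>

/* Mirror of the cmp %rdx,%rax / cmova %rdx,%rax pair: addresses above the
 * limit are replaced by the limit itself, everything else passes through.
 */
static uint64_t clamp_user_addr(uint64_t addr, uint64_t user_ptr_max)
{
	return addr > user_ptr_max ? user_ptr_max : addr;
}
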
diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S
index 774bdf3e6f0a..edbeb3ecad38 100644
--- a/arch/x86/lib/hweight.S
+++ b/arch/x86/lib/hweight.S
@@ -1,6 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/export.h>
#include <linux/linkage.h>
+#include <linux/objtool.h>
#include <asm/asm.h>
@@ -9,6 +10,7 @@
* %rdi: w
*/
SYM_FUNC_START(__sw_hweight32)
+ ANNOTATE_NOENDBR
#ifdef CONFIG_X86_64
movl %edi, %eax # w
@@ -42,6 +44,7 @@ EXPORT_SYMBOL(__sw_hweight32)
*/
#ifdef CONFIG_X86_64
SYM_FUNC_START(__sw_hweight64)
+ ANNOTATE_NOENDBR
pushq %rdi
pushq %rdx
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
index 1bb155a0955b..6ffb931b9fb1 100644
--- a/arch/x86/lib/insn.c
+++ b/arch/x86/lib/insn.c
@@ -13,7 +13,7 @@
#endif
#include <asm/inat.h> /*__ignore_sync_check__ */
#include <asm/insn.h> /* __ignore_sync_check__ */
-#include <asm/unaligned.h> /* __ignore_sync_check__ */
+#include <linux/unaligned.h> /* __ignore_sync_check__ */
#include <linux/errno.h>
#include <linux/kconfig.h>
@@ -185,6 +185,17 @@ found:
if (X86_REX_W(b))
/* REX.W overrides opnd_size */
insn->opnd_bytes = 8;
+ } else if (inat_is_rex2_prefix(attr)) {
+ insn_set_byte(&insn->rex_prefix, 0, b);
+ b = peek_nbyte_next(insn_byte_t, insn, 1);
+ insn_set_byte(&insn->rex_prefix, 1, b);
+ insn->rex_prefix.nbytes = 2;
+ insn->next_byte += 2;
+ if (X86_REX_W(b))
+ /* REX.W overrides opnd_size */
+ insn->opnd_bytes = 8;
+ insn->rex_prefix.got = 1;
+ goto vex_end;
}
}
insn->rex_prefix.got = 1;
@@ -283,6 +294,10 @@ int insn_get_opcode(struct insn *insn)
m = insn_vex_m_bits(insn);
p = insn_vex_p_bits(insn);
insn->attr = inat_get_avx_attribute(op, m, p);
+ /* SCALABLE EVEX uses p bits to encode operand size */
+ if (inat_evex_scalable(insn->attr) && !insn_vex_w_bit(insn) &&
+ p == INAT_PFX_OPNDSZ)
+ insn->opnd_bytes = 2;
if ((inat_must_evex(insn->attr) && !insn_is_evex(insn)) ||
(!inat_accept_vex(insn->attr) &&
!inat_is_group(insn->attr))) {
@@ -294,6 +309,20 @@ int insn_get_opcode(struct insn *insn)
goto end;
}
+	/* Check whether a REX2 prefix is present */
+ if (insn_is_rex2(insn)) {
+ if (insn_rex2_m_bit(insn)) {
+ /* map 1 is escape 0x0f */
+ insn_attr_t esc_attr = inat_get_opcode_attribute(0x0f);
+
+ pfx_id = insn_last_prefix_id(insn);
+ insn->attr = inat_get_escape_attribute(op, pfx_id, esc_attr);
+ } else {
+ insn->attr = inat_get_opcode_attribute(op);
+ }
+ goto end;
+ }
+
insn->attr = inat_get_opcode_attribute(op);
while (inat_is_escape(insn->attr)) {
/* Get escaped opcode */
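
With the REX2 branch above, a two-byte rex_prefix is what marks an APX REX2-prefixed instruction in the decoded struct insn. A sketch of a check built only from the fields this hunk fills in; insn_has_rex2() is an illustrative helper written for this note, not something the patch adds:

#include <linux/types.h>
#include <asm/insn.h>

/* Classic REX records one prefix byte, REX2 records two (0xd5 plus its
 * payload byte), so the recorded byte count alone distinguishes them.
 */
static bool insn_has_rex2(const struct insn *insn)
{
	return insn->rex_prefix.nbytes == 2;
}
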
diff --git a/arch/x86/lib/iomap_copy_64.S b/arch/x86/lib/iomap_copy_64.S
deleted file mode 100644
index 6ff2f56cb0f7..000000000000
--- a/arch/x86/lib/iomap_copy_64.S
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright 2006 PathScale, Inc. All Rights Reserved.
- */
-
-#include <linux/linkage.h>
-
-/*
- * override generic version in lib/iomap_copy.c
- */
-SYM_FUNC_START(__iowrite32_copy)
- movl %edx,%ecx
- rep movsl
- RET
-SYM_FUNC_END(__iowrite32_copy)
diff --git a/arch/x86/lib/iomem.c b/arch/x86/lib/iomem.c
index e0411a3774d4..5eecb45d05d5 100644
--- a/arch/x86/lib/iomem.c
+++ b/arch/x86/lib/iomem.c
@@ -25,6 +25,9 @@ static __always_inline void rep_movs(void *to, const void *from, size_t n)
static void string_memcpy_fromio(void *to, const volatile void __iomem *from, size_t n)
{
+ const void *orig_to = to;
+ const size_t orig_n = n;
+
if (unlikely(!n))
return;
@@ -39,7 +42,7 @@ static void string_memcpy_fromio(void *to, const volatile void __iomem *from, si
}
rep_movs(to, (const void *)from, n);
/* KMSAN must treat values read from devices as initialized. */
- kmsan_unpoison_memory(to, n);
+ kmsan_unpoison_memory(orig_to, orig_n);
}
static void string_memcpy_toio(volatile void __iomem *to, const void *from, size_t n)
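
The fix above is needed because the function's alignment prologue (elided by the hunk context) advances to and shrinks n before rep_movs() runs, so unpoisoning with the live variables only covered part of the destination. A toy model of the save-on-entry pattern, with mark_initialized() standing in for kmsan_unpoison_memory():

#include <stddef.h>

static void mark_initialized(void *p, size_t n)
{
	(void)p;	/* stand-in for kmsan_unpoison_memory() */
	(void)n;
}

/* The copy below advances the same variables that describe the destination,
 * so whole-range bookkeeping at the end must use the values captured on
 * entry, exactly as orig_to/orig_n do in the hunk above.
 */
static void copy_model(unsigned char *to, const unsigned char *from, size_t n)
{
	unsigned char *orig_to = to;
	size_t orig_n = n;

	while (n) {
		*to++ = *from++;
		n--;
	}
	/* Here to points past the end and n is 0; only the saved values
	 * still describe the full destination range.
	 */
	mark_initialized(orig_to, orig_n);
}
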
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 1b60ae81ecd8..aa1f92ee6b2e 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -8,6 +8,7 @@
*/
#include <linux/export.h>
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
@@ -26,7 +27,7 @@
* Output:
* rax: dest
*/
-SYM_FUNC_START(__memmove)
+SYM_TYPED_FUNC_START(__memmove)
mov %rdi, %rax
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 0199d56cb479..d66b710d628f 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -3,6 +3,7 @@
#include <linux/export.h>
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
@@ -28,7 +29,7 @@
* only for the return value that is the same as the source input,
* which the compiler could/should do much better anyway.
*/
-SYM_FUNC_START(__memset)
+SYM_TYPED_FUNC_START(__memset)
ALTERNATIVE "jmp memset_orig", "", X86_FEATURE_FSRS
movq %rdi,%r9
diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S
index ebd259f31496..5ef8494896e8 100644
--- a/arch/x86/lib/msr-reg.S
+++ b/arch/x86/lib/msr-reg.S
@@ -1,6 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/linkage.h>
#include <linux/errno.h>
+#include <linux/cfi_types.h>
#include <asm/asm.h>
#include <asm/msr.h>
@@ -12,7 +13,7 @@
*
*/
.macro op_safe_regs op
-SYM_FUNC_START(\op\()_safe_regs)
+SYM_TYPED_FUNC_START(\op\()_safe_regs)
pushq %rbx
pushq %r12
movq %rdi, %r10 /* Save pointer */
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
index 4bf4fad5b148..5a18ecc04a6c 100644
--- a/arch/x86/lib/msr.c
+++ b/arch/x86/lib/msr.c
@@ -103,6 +103,7 @@ int msr_set_bit(u32 msr, u8 bit)
{
return __flip_bit(msr, bit, true);
}
+EXPORT_SYMBOL_GPL(msr_set_bit);
/**
* msr_clear_bit - Clear @bit in a MSR @msr.
@@ -118,6 +119,7 @@ int msr_clear_bit(u32 msr, u8 bit)
{
return __flip_bit(msr, bit, false);
}
+EXPORT_SYMBOL_GPL(msr_clear_bit);
#ifdef CONFIG_TRACEPOINTS
void do_trace_write_msr(unsigned int msr, u64 val, int failed)
diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S
index 975c9c18263d..46d9e9b98a61 100644
--- a/arch/x86/lib/putuser.S
+++ b/arch/x86/lib/putuser.S
@@ -13,6 +13,7 @@
*/
#include <linux/export.h>
#include <linux/linkage.h>
+#include <linux/objtool.h>
#include <asm/thread_info.h>
#include <asm/errno.h>
#include <asm/asm.h>
@@ -45,6 +46,7 @@
.text
SYM_FUNC_START(__put_user_1)
+ ANNOTATE_NOENDBR
check_range size=1
ASM_STAC
1: movb %al,(%_ASM_CX)
@@ -55,6 +57,7 @@ SYM_FUNC_END(__put_user_1)
EXPORT_SYMBOL(__put_user_1)
SYM_FUNC_START(__put_user_nocheck_1)
+ ANNOTATE_NOENDBR
ASM_STAC
2: movb %al,(%_ASM_CX)
xor %ecx,%ecx
@@ -64,6 +67,7 @@ SYM_FUNC_END(__put_user_nocheck_1)
EXPORT_SYMBOL(__put_user_nocheck_1)
SYM_FUNC_START(__put_user_2)
+ ANNOTATE_NOENDBR
check_range size=2
ASM_STAC
3: movw %ax,(%_ASM_CX)
@@ -74,6 +78,7 @@ SYM_FUNC_END(__put_user_2)
EXPORT_SYMBOL(__put_user_2)
SYM_FUNC_START(__put_user_nocheck_2)
+ ANNOTATE_NOENDBR
ASM_STAC
4: movw %ax,(%_ASM_CX)
xor %ecx,%ecx
@@ -83,6 +88,7 @@ SYM_FUNC_END(__put_user_nocheck_2)
EXPORT_SYMBOL(__put_user_nocheck_2)
SYM_FUNC_START(__put_user_4)
+ ANNOTATE_NOENDBR
check_range size=4
ASM_STAC
5: movl %eax,(%_ASM_CX)
@@ -93,6 +99,7 @@ SYM_FUNC_END(__put_user_4)
EXPORT_SYMBOL(__put_user_4)
SYM_FUNC_START(__put_user_nocheck_4)
+ ANNOTATE_NOENDBR
ASM_STAC
6: movl %eax,(%_ASM_CX)
xor %ecx,%ecx
@@ -102,6 +109,7 @@ SYM_FUNC_END(__put_user_nocheck_4)
EXPORT_SYMBOL(__put_user_nocheck_4)
SYM_FUNC_START(__put_user_8)
+ ANNOTATE_NOENDBR
check_range size=8
ASM_STAC
7: mov %_ASM_AX,(%_ASM_CX)
@@ -115,6 +123,7 @@ SYM_FUNC_END(__put_user_8)
EXPORT_SYMBOL(__put_user_8)
SYM_FUNC_START(__put_user_nocheck_8)
+ ANNOTATE_NOENDBR
ASM_STAC
9: mov %_ASM_AX,(%_ASM_CX)
#ifdef CONFIG_X86_32
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index 391059b2c6fb..a26c43abd47d 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -326,6 +326,7 @@ SYM_FUNC_END(retbleed_untrain_ret)
#if defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO)
SYM_FUNC_START(entry_untrain_ret)
+ ANNOTATE_NOENDBR
ALTERNATIVE JMP_RETBLEED_UNTRAIN_RET, JMP_SRSO_UNTRAIN_RET, X86_FEATURE_SRSO
SYM_FUNC_END(entry_untrain_ret)
__EXPORT_THUNK(entry_untrain_ret)
@@ -342,7 +343,7 @@ SYM_FUNC_START(call_depth_return_thunk)
* case.
*/
CALL_THUNKS_DEBUG_INC_RETS
- shlq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth)
+ shlq $5, PER_CPU_VAR(__x86_call_depth)
jz 1f
ANNOTATE_UNRET_SAFE
ret
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index e9251b89a9e9..654280aaa3e9 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -18,7 +18,7 @@
#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
/**
* clean_cache_range - write back a cache range with CLWB
- * @vaddr: virtual start address
+ * @addr: virtual start address
* @size: number of bytes to write back
*
* Write back a cache range using the CLWB (cache line write back)
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index 12af572201a2..f5dd84eb55dc 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -23,6 +23,7 @@
#
# AVX Superscripts
# (ev): this opcode requires EVEX prefix.
+# (es): this opcode requires EVEX prefix and is SCALABLE.
# (evo): this opcode is changed by EVEX prefix (EVEX opcode)
# (v): this opcode requires VEX prefix.
# (v1): this opcode only supports 128bit VEX.
@@ -33,6 +34,10 @@
# - (F2): the last prefix is 0xF2
# - (!F3) : the last prefix is not 0xF3 (including non-last prefix case)
# - (66&F2): Both 0x66 and 0xF2 prefixes are specified.
+#
+# REX2 Prefix
+# - (!REX2): REX2 is not allowed
+# - (REX2): REX2 variant, e.g. JMPABS
Table: one byte opcode
Referrer:
@@ -148,7 +153,7 @@ AVXcode:
65: SEG=GS (Prefix)
66: Operand-Size (Prefix)
67: Address-Size (Prefix)
-68: PUSH Iz (d64)
+68: PUSH Iz
69: IMUL Gv,Ev,Iz
6a: PUSH Ib (d64)
6b: IMUL Gv,Ev,Ib
@@ -157,22 +162,22 @@ AVXcode:
6e: OUTS/OUTSB DX,Xb
6f: OUTS/OUTSW/OUTSD DX,Xz
# 0x70 - 0x7f
-70: JO Jb
-71: JNO Jb
-72: JB/JNAE/JC Jb
-73: JNB/JAE/JNC Jb
-74: JZ/JE Jb
-75: JNZ/JNE Jb
-76: JBE/JNA Jb
-77: JNBE/JA Jb
-78: JS Jb
-79: JNS Jb
-7a: JP/JPE Jb
-7b: JNP/JPO Jb
-7c: JL/JNGE Jb
-7d: JNL/JGE Jb
-7e: JLE/JNG Jb
-7f: JNLE/JG Jb
+70: JO Jb (!REX2)
+71: JNO Jb (!REX2)
+72: JB/JNAE/JC Jb (!REX2)
+73: JNB/JAE/JNC Jb (!REX2)
+74: JZ/JE Jb (!REX2)
+75: JNZ/JNE Jb (!REX2)
+76: JBE/JNA Jb (!REX2)
+77: JNBE/JA Jb (!REX2)
+78: JS Jb (!REX2)
+79: JNS Jb (!REX2)
+7a: JP/JPE Jb (!REX2)
+7b: JNP/JPO Jb (!REX2)
+7c: JL/JNGE Jb (!REX2)
+7d: JNL/JGE Jb (!REX2)
+7e: JLE/JNG Jb (!REX2)
+7f: JNLE/JG Jb (!REX2)
# 0x80 - 0x8f
80: Grp1 Eb,Ib (1A)
81: Grp1 Ev,Iz (1A)
@@ -208,24 +213,24 @@ AVXcode:
9e: SAHF
9f: LAHF
# 0xa0 - 0xaf
-a0: MOV AL,Ob
-a1: MOV rAX,Ov
-a2: MOV Ob,AL
-a3: MOV Ov,rAX
-a4: MOVS/B Yb,Xb
-a5: MOVS/W/D/Q Yv,Xv
-a6: CMPS/B Xb,Yb
-a7: CMPS/W/D Xv,Yv
-a8: TEST AL,Ib
-a9: TEST rAX,Iz
-aa: STOS/B Yb,AL
-ab: STOS/W/D/Q Yv,rAX
-ac: LODS/B AL,Xb
-ad: LODS/W/D/Q rAX,Xv
-ae: SCAS/B AL,Yb
+a0: MOV AL,Ob (!REX2)
+a1: MOV rAX,Ov (!REX2) | JMPABS O (REX2),(o64)
+a2: MOV Ob,AL (!REX2)
+a3: MOV Ov,rAX (!REX2)
+a4: MOVS/B Yb,Xb (!REX2)
+a5: MOVS/W/D/Q Yv,Xv (!REX2)
+a6: CMPS/B Xb,Yb (!REX2)
+a7: CMPS/W/D Xv,Yv (!REX2)
+a8: TEST AL,Ib (!REX2)
+a9: TEST rAX,Iz (!REX2)
+aa: STOS/B Yb,AL (!REX2)
+ab: STOS/W/D/Q Yv,rAX (!REX2)
+ac: LODS/B AL,Xb (!REX2)
+ad: LODS/W/D/Q rAX,Xv (!REX2)
+ae: SCAS/B AL,Yb (!REX2)
# Note: The May 2011 Intel manual shows Xv for the second parameter of the
# next instruction but Yv is correct
-af: SCAS/W/D/Q rAX,Yv
+af: SCAS/W/D/Q rAX,Yv (!REX2)
# 0xb0 - 0xbf
b0: MOV AL/R8L,Ib
b1: MOV CL/R9L,Ib
@@ -266,7 +271,7 @@ d1: Grp2 Ev,1 (1A)
d2: Grp2 Eb,CL (1A)
d3: Grp2 Ev,CL (1A)
d4: AAM Ib (i64)
-d5: AAD Ib (i64)
+d5: AAD Ib (i64) | REX2 (Prefix),(o64)
d6:
d7: XLAT/XLATB
d8: ESC
@@ -281,26 +286,26 @@ df: ESC
# Note: "forced64" is Intel CPU behavior: they ignore 0x66 prefix
# in 64-bit mode. AMD CPUs accept 0x66 prefix, it causes RIP truncation
# to 16 bits. In 32-bit mode, 0x66 is accepted by both Intel and AMD.
-e0: LOOPNE/LOOPNZ Jb (f64)
-e1: LOOPE/LOOPZ Jb (f64)
-e2: LOOP Jb (f64)
-e3: JrCXZ Jb (f64)
-e4: IN AL,Ib
-e5: IN eAX,Ib
-e6: OUT Ib,AL
-e7: OUT Ib,eAX
+e0: LOOPNE/LOOPNZ Jb (f64) (!REX2)
+e1: LOOPE/LOOPZ Jb (f64) (!REX2)
+e2: LOOP Jb (f64) (!REX2)
+e3: JrCXZ Jb (f64) (!REX2)
+e4: IN AL,Ib (!REX2)
+e5: IN eAX,Ib (!REX2)
+e6: OUT Ib,AL (!REX2)
+e7: OUT Ib,eAX (!REX2)
# With 0x66 prefix in 64-bit mode, for AMD CPUs immediate offset
# in "near" jumps and calls is 16-bit. For CALL,
# push of return address is 16-bit wide, RSP is decremented by 2
# but is not truncated to 16 bits, unlike RIP.
-e8: CALL Jz (f64)
-e9: JMP-near Jz (f64)
-ea: JMP-far Ap (i64)
-eb: JMP-short Jb (f64)
-ec: IN AL,DX
-ed: IN eAX,DX
-ee: OUT DX,AL
-ef: OUT DX,eAX
+e8: CALL Jz (f64) (!REX2)
+e9: JMP-near Jz (f64) (!REX2)
+ea: JMP-far Ap (i64) (!REX2)
+eb: JMP-short Jb (f64) (!REX2)
+ec: IN AL,DX (!REX2)
+ed: IN eAX,DX (!REX2)
+ee: OUT DX,AL (!REX2)
+ef: OUT DX,eAX (!REX2)
# 0xf0 - 0xff
f0: LOCK (Prefix)
f1:
@@ -386,14 +391,14 @@ AVXcode: 1
2e: vucomiss Vss,Wss (v1) | vucomisd Vsd,Wsd (66),(v1)
2f: vcomiss Vss,Wss (v1) | vcomisd Vsd,Wsd (66),(v1)
# 0x0f 0x30-0x3f
-30: WRMSR
-31: RDTSC
-32: RDMSR
-33: RDPMC
-34: SYSENTER
-35: SYSEXIT
+30: WRMSR (!REX2)
+31: RDTSC (!REX2)
+32: RDMSR (!REX2)
+33: RDPMC (!REX2)
+34: SYSENTER (!REX2)
+35: SYSEXIT (!REX2)
36:
-37: GETSEC
+37: GETSEC (!REX2)
38: escape # 3-byte escape 1
39:
3a: escape # 3-byte escape 2
@@ -473,22 +478,22 @@ AVXcode: 1
7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqa32/64 Wx,Vx (66),(evo) | vmovdqu Wx,Vx (F3) | vmovdqu32/64 Wx,Vx (F3),(evo) | vmovdqu8/16 Wx,Vx (F2),(ev)
# 0x0f 0x80-0x8f
# Note: "forced64" is Intel CPU behavior (see comment about CALL insn).
-80: JO Jz (f64)
-81: JNO Jz (f64)
-82: JB/JC/JNAE Jz (f64)
-83: JAE/JNB/JNC Jz (f64)
-84: JE/JZ Jz (f64)
-85: JNE/JNZ Jz (f64)
-86: JBE/JNA Jz (f64)
-87: JA/JNBE Jz (f64)
-88: JS Jz (f64)
-89: JNS Jz (f64)
-8a: JP/JPE Jz (f64)
-8b: JNP/JPO Jz (f64)
-8c: JL/JNGE Jz (f64)
-8d: JNL/JGE Jz (f64)
-8e: JLE/JNG Jz (f64)
-8f: JNLE/JG Jz (f64)
+80: JO Jz (f64) (!REX2)
+81: JNO Jz (f64) (!REX2)
+82: JB/JC/JNAE Jz (f64) (!REX2)
+83: JAE/JNB/JNC Jz (f64) (!REX2)
+84: JE/JZ Jz (f64) (!REX2)
+85: JNE/JNZ Jz (f64) (!REX2)
+86: JBE/JNA Jz (f64) (!REX2)
+87: JA/JNBE Jz (f64) (!REX2)
+88: JS Jz (f64) (!REX2)
+89: JNS Jz (f64) (!REX2)
+8a: JP/JPE Jz (f64) (!REX2)
+8b: JNP/JPO Jz (f64) (!REX2)
+8c: JL/JNGE Jz (f64) (!REX2)
+8d: JNL/JGE Jz (f64) (!REX2)
+8e: JLE/JNG Jz (f64) (!REX2)
+8f: JNLE/JG Jz (f64) (!REX2)
# 0x0f 0x90-0x9f
90: SETO Eb | kmovw/q Vk,Wk | kmovb/d Vk,Wk (66)
91: SETNO Eb | kmovw/q Mv,Vk | kmovb/d Mv,Vk (66)
@@ -698,17 +703,17 @@ AVXcode: 2
4d: vrcp14ss/d Vsd,Hpd,Wsd (66),(ev)
4e: vrsqrt14ps/d Vpd,Wpd (66),(ev)
4f: vrsqrt14ss/d Vsd,Hsd,Wsd (66),(ev)
-50: vpdpbusd Vx,Hx,Wx (66),(ev)
-51: vpdpbusds Vx,Hx,Wx (66),(ev)
-52: vdpbf16ps Vx,Hx,Wx (F3),(ev) | vpdpwssd Vx,Hx,Wx (66),(ev) | vp4dpwssd Vdqq,Hdqq,Wdq (F2),(ev)
-53: vpdpwssds Vx,Hx,Wx (66),(ev) | vp4dpwssds Vdqq,Hdqq,Wdq (F2),(ev)
+50: vpdpbusd Vx,Hx,Wx (66) | vpdpbssd Vx,Hx,Wx (F2),(v) | vpdpbsud Vx,Hx,Wx (F3),(v) | vpdpbuud Vx,Hx,Wx (v)
+51: vpdpbusds Vx,Hx,Wx (66) | vpdpbssds Vx,Hx,Wx (F2),(v) | vpdpbsuds Vx,Hx,Wx (F3),(v) | vpdpbuuds Vx,Hx,Wx (v)
+52: vdpbf16ps Vx,Hx,Wx (F3),(ev) | vpdpwssd Vx,Hx,Wx (66) | vp4dpwssd Vdqq,Hdqq,Wdq (F2),(ev)
+53: vpdpwssds Vx,Hx,Wx (66) | vp4dpwssds Vdqq,Hdqq,Wdq (F2),(ev)
54: vpopcntb/w Vx,Wx (66),(ev)
55: vpopcntd/q Vx,Wx (66),(ev)
58: vpbroadcastd Vx,Wx (66),(v)
59: vpbroadcastq Vx,Wx (66),(v) | vbroadcasti32x2 Vx,Wx (66),(evo)
5a: vbroadcasti128 Vqq,Mdq (66),(v) | vbroadcasti32x4/64x2 Vx,Wx (66),(evo)
5b: vbroadcasti32x8/64x4 Vqq,Mdq (66),(ev)
-5c: TDPBF16PS Vt,Wt,Ht (F3),(v1)
+5c: TDPBF16PS Vt,Wt,Ht (F3),(v1) | TDPFP16PS Vt,Wt,Ht (F2),(v1),(o64)
# Skip 0x5d
5e: TDPBSSD Vt,Wt,Ht (F2),(v1) | TDPBSUD Vt,Wt,Ht (F3),(v1) | TDPBUSD Vt,Wt,Ht (66),(v1) | TDPBUUD Vt,Wt,Ht (v1)
# Skip 0x5f-0x61
@@ -718,10 +723,12 @@ AVXcode: 2
65: vblendmps/d Vx,Hx,Wx (66),(ev)
66: vpblendmb/w Vx,Hx,Wx (66),(ev)
68: vp2intersectd/q Kx,Hx,Wx (F2),(ev)
-# Skip 0x69-0x6f
+# Skip 0x69-0x6b
+6c: TCMMIMFP16PS Vt,Wt,Ht (66),(v1),(o64) | TCMMRLFP16PS Vt,Wt,Ht (v1),(o64)
+# Skip 0x6d-0x6f
70: vpshldvw Vx,Hx,Wx (66),(ev)
71: vpshldvd/q Vx,Hx,Wx (66),(ev)
-72: vcvtne2ps2bf16 Vx,Hx,Wx (F2),(ev) | vcvtneps2bf16 Vx,Wx (F3),(ev) | vpshrdvw Vx,Hx,Wx (66),(ev)
+72: vcvtne2ps2bf16 Vx,Hx,Wx (F2),(ev) | vcvtneps2bf16 Vx,Wx (F3) | vpshrdvw Vx,Hx,Wx (66),(ev)
73: vpshrdvd/q Vx,Hx,Wx (66),(ev)
75: vpermi2b/w Vx,Hx,Wx (66),(ev)
76: vpermi2d/q Vx,Hx,Wx (66),(ev)
@@ -777,8 +784,10 @@ ac: vfnmadd213ps/d Vx,Hx,Wx (66),(v)
ad: vfnmadd213ss/d Vx,Hx,Wx (66),(v),(v1)
ae: vfnmsub213ps/d Vx,Hx,Wx (66),(v)
af: vfnmsub213ss/d Vx,Hx,Wx (66),(v),(v1)
-b4: vpmadd52luq Vx,Hx,Wx (66),(ev)
-b5: vpmadd52huq Vx,Hx,Wx (66),(ev)
+b0: vcvtneebf162ps Vx,Mx (F3),(!11B),(v) | vcvtneeph2ps Vx,Mx (66),(!11B),(v) | vcvtneobf162ps Vx,Mx (F2),(!11B),(v) | vcvtneoph2ps Vx,Mx (!11B),(v)
+b1: vbcstnebf162ps Vx,Mw (F3),(!11B),(v) | vbcstnesh2ps Vx,Mw (66),(!11B),(v)
+b4: vpmadd52luq Vx,Hx,Wx (66)
+b5: vpmadd52huq Vx,Hx,Wx (66)
b6: vfmaddsub231ps/d Vx,Hx,Wx (66),(v)
b7: vfmsubadd231ps/d Vx,Hx,Wx (66),(v)
b8: vfmadd231ps/d Vx,Hx,Wx (66),(v)
@@ -796,15 +805,35 @@ c7: Grp19 (1A)
c8: sha1nexte Vdq,Wdq | vexp2ps/d Vx,Wx (66),(ev)
c9: sha1msg1 Vdq,Wdq
ca: sha1msg2 Vdq,Wdq | vrcp28ps/d Vx,Wx (66),(ev)
-cb: sha256rnds2 Vdq,Wdq | vrcp28ss/d Vx,Hx,Wx (66),(ev)
-cc: sha256msg1 Vdq,Wdq | vrsqrt28ps/d Vx,Wx (66),(ev)
-cd: sha256msg2 Vdq,Wdq | vrsqrt28ss/d Vx,Hx,Wx (66),(ev)
+cb: sha256rnds2 Vdq,Wdq | vrcp28ss/d Vx,Hx,Wx (66),(ev) | vsha512rnds2 Vqq,Hqq,Udq (F2),(11B),(v)
+cc: sha256msg1 Vdq,Wdq | vrsqrt28ps/d Vx,Wx (66),(ev) | vsha512msg1 Vqq,Udq (F2),(11B),(v)
+cd: sha256msg2 Vdq,Wdq | vrsqrt28ss/d Vx,Hx,Wx (66),(ev) | vsha512msg2 Vqq,Uqq (F2),(11B),(v)
cf: vgf2p8mulb Vx,Wx (66)
+d2: vpdpwsud Vx,Hx,Wx (F3),(v) | vpdpwusd Vx,Hx,Wx (66),(v) | vpdpwuud Vx,Hx,Wx (v)
+d3: vpdpwsuds Vx,Hx,Wx (F3),(v) | vpdpwusds Vx,Hx,Wx (66),(v) | vpdpwuuds Vx,Hx,Wx (v)
+d8: AESENCWIDE128KL Qpi (F3),(000),(00B) | AESENCWIDE256KL Qpi (F3),(000),(10B) | AESDECWIDE128KL Qpi (F3),(000),(01B) | AESDECWIDE256KL Qpi (F3),(000),(11B)
+da: vsm3msg1 Vdq,Hdq,Udq (v1) | vsm3msg2 Vdq,Hdq,Udq (66),(v1) | vsm4key4 Vx,Hx,Wx (F3),(v) | vsm4rnds4 Vx,Hx,Wx (F2),(v)
db: VAESIMC Vdq,Wdq (66),(v1)
-dc: vaesenc Vx,Hx,Wx (66)
-dd: vaesenclast Vx,Hx,Wx (66)
-de: vaesdec Vx,Hx,Wx (66)
-df: vaesdeclast Vx,Hx,Wx (66)
+dc: vaesenc Vx,Hx,Wx (66) | LOADIWKEY Vx,Hx (F3) | AESENC128KL Vpd,Qpi (F3)
+dd: vaesenclast Vx,Hx,Wx (66) | AESDEC128KL Vpd,Qpi (F3)
+de: vaesdec Vx,Hx,Wx (66) | AESENC256KL Vpd,Qpi (F3)
+df: vaesdeclast Vx,Hx,Wx (66) | AESDEC256KL Vpd,Qpi (F3)
+e0: CMPOXADD My,Gy,By (66),(v1),(o64)
+e1: CMPNOXADD My,Gy,By (66),(v1),(o64)
+e2: CMPBXADD My,Gy,By (66),(v1),(o64)
+e3: CMPNBXADD My,Gy,By (66),(v1),(o64)
+e4: CMPZXADD My,Gy,By (66),(v1),(o64)
+e5: CMPNZXADD My,Gy,By (66),(v1),(o64)
+e6: CMPBEXADD My,Gy,By (66),(v1),(o64)
+e7: CMPNBEXADD My,Gy,By (66),(v1),(o64)
+e8: CMPSXADD My,Gy,By (66),(v1),(o64)
+e9: CMPNSXADD My,Gy,By (66),(v1),(o64)
+ea: CMPPXADD My,Gy,By (66),(v1),(o64)
+eb: CMPNPXADD My,Gy,By (66),(v1),(o64)
+ec: CMPLXADD My,Gy,By (66),(v1),(o64)
+ed: CMPNLXADD My,Gy,By (66),(v1),(o64)
+ee: CMPLEXADD My,Gy,By (66),(v1),(o64)
+ef: CMPNLEXADD My,Gy,By (66),(v1),(o64)
f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) | CRC32 Gd,Eb (66&F2)
f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) | CRC32 Gd,Ew (66&F2)
f2: ANDN Gy,By,Ey (v)
@@ -812,8 +841,11 @@ f3: Grp17 (1A)
f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v) | WRUSSD/Q My,Gy (66)
f6: ADCX Gy,Ey (66) | ADOX Gy,Ey (F3) | MULX By,Gy,rDX,Ey (F2),(v) | WRSSD/Q My,Gy
f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v)
-f8: MOVDIR64B Gv,Mdqq (66) | ENQCMD Gv,Mdqq (F2) | ENQCMDS Gv,Mdqq (F3)
+f8: MOVDIR64B Gv,Mdqq (66) | ENQCMD Gv,Mdqq (F2) | ENQCMDS Gv,Mdqq (F3) | URDMSR Rq,Gq (F2),(11B) | UWRMSR Gq,Rq (F3),(11B)
f9: MOVDIRI My,Gy
+fa: ENCODEKEY128 Ew,Ew (F3)
+fb: ENCODEKEY256 Ew,Ew (F3)
+fc: AADD My,Gy | AAND My,Gy (66) | AOR My,Gy (F2) | AXOR My,Gy (F3)
EndTable
Table: 3-byte opcode 2 (0x0f 0x3a)
@@ -893,10 +925,103 @@ c2: vcmpph Vx,Hx,Wx,Ib (ev) | vcmpsh Vx,Hx,Wx,Ib (F3),(ev)
cc: sha1rnds4 Vdq,Wdq,Ib
ce: vgf2p8affineqb Vx,Wx,Ib (66)
cf: vgf2p8affineinvqb Vx,Wx,Ib (66)
+de: vsm3rnds2 Vdq,Hdq,Wdq,Ib (66),(v1)
df: VAESKEYGEN Vdq,Wdq,Ib (66),(v1)
f0: RORX Gy,Ey,Ib (F2),(v) | HRESET Gv,Ib (F3),(000),(11B)
EndTable
+Table: EVEX map 4
+Referrer:
+AVXcode: 4
+00: ADD Eb,Gb (ev)
+01: ADD Ev,Gv (es) | ADD Ev,Gv (66),(es)
+02: ADD Gb,Eb (ev)
+03: ADD Gv,Ev (es) | ADD Gv,Ev (66),(es)
+08: OR Eb,Gb (ev)
+09: OR Ev,Gv (es) | OR Ev,Gv (66),(es)
+0a: OR Gb,Eb (ev)
+0b: OR Gv,Ev (es) | OR Gv,Ev (66),(es)
+10: ADC Eb,Gb (ev)
+11: ADC Ev,Gv (es) | ADC Ev,Gv (66),(es)
+12: ADC Gb,Eb (ev)
+13: ADC Gv,Ev (es) | ADC Gv,Ev (66),(es)
+18: SBB Eb,Gb (ev)
+19: SBB Ev,Gv (es) | SBB Ev,Gv (66),(es)
+1a: SBB Gb,Eb (ev)
+1b: SBB Gv,Ev (es) | SBB Gv,Ev (66),(es)
+20: AND Eb,Gb (ev)
+21: AND Ev,Gv (es) | AND Ev,Gv (66),(es)
+22: AND Gb,Eb (ev)
+23: AND Gv,Ev (es) | AND Gv,Ev (66),(es)
+24: SHLD Ev,Gv,Ib (es) | SHLD Ev,Gv,Ib (66),(es)
+28: SUB Eb,Gb (ev)
+29: SUB Ev,Gv (es) | SUB Ev,Gv (66),(es)
+2a: SUB Gb,Eb (ev)
+2b: SUB Gv,Ev (es) | SUB Gv,Ev (66),(es)
+2c: SHRD Ev,Gv,Ib (es) | SHRD Ev,Gv,Ib (66),(es)
+30: XOR Eb,Gb (ev)
+31: XOR Ev,Gv (es) | XOR Ev,Gv (66),(es)
+32: XOR Gb,Eb (ev)
+33: XOR Gv,Ev (es) | XOR Gv,Ev (66),(es)
+# CCMPSCC instructions are: CCOMB, CCOMBE, CCOMF, CCOML, CCOMLE, CCOMNB, CCOMNBE, CCOMNL, CCOMNLE,
+# CCOMNO, CCOMNS, CCOMNZ, CCOMO, CCOMS, CCOMT, CCOMZ
+38: CCMPSCC Eb,Gb (ev)
+39: CCMPSCC Ev,Gv (es) | CCMPSCC Ev,Gv (66),(es)
+3a: CCMPSCC Gv,Ev (ev)
+3b: CCMPSCC Gv,Ev (es) | CCMPSCC Gv,Ev (66),(es)
+40: CMOVO Gv,Ev (es) | CMOVO Gv,Ev (66),(es) | CFCMOVO Ev,Ev (es) | CFCMOVO Ev,Ev (66),(es) | SETO Eb (F2),(ev)
+41: CMOVNO Gv,Ev (es) | CMOVNO Gv,Ev (66),(es) | CFCMOVNO Ev,Ev (es) | CFCMOVNO Ev,Ev (66),(es) | SETNO Eb (F2),(ev)
+42: CMOVB Gv,Ev (es) | CMOVB Gv,Ev (66),(es) | CFCMOVB Ev,Ev (es) | CFCMOVB Ev,Ev (66),(es) | SETB Eb (F2),(ev)
+43: CMOVNB Gv,Ev (es) | CMOVNB Gv,Ev (66),(es) | CFCMOVNB Ev,Ev (es) | CFCMOVNB Ev,Ev (66),(es) | SETNB Eb (F2),(ev)
+44: CMOVZ Gv,Ev (es) | CMOVZ Gv,Ev (66),(es) | CFCMOVZ Ev,Ev (es) | CFCMOVZ Ev,Ev (66),(es) | SETZ Eb (F2),(ev)
+45: CMOVNZ Gv,Ev (es) | CMOVNZ Gv,Ev (66),(es) | CFCMOVNZ Ev,Ev (es) | CFCMOVNZ Ev,Ev (66),(es) | SETNZ Eb (F2),(ev)
+46: CMOVBE Gv,Ev (es) | CMOVBE Gv,Ev (66),(es) | CFCMOVBE Ev,Ev (es) | CFCMOVBE Ev,Ev (66),(es) | SETBE Eb (F2),(ev)
+47: CMOVNBE Gv,Ev (es) | CMOVNBE Gv,Ev (66),(es) | CFCMOVNBE Ev,Ev (es) | CFCMOVNBE Ev,Ev (66),(es) | SETNBE Eb (F2),(ev)
+48: CMOVS Gv,Ev (es) | CMOVS Gv,Ev (66),(es) | CFCMOVS Ev,Ev (es) | CFCMOVS Ev,Ev (66),(es) | SETS Eb (F2),(ev)
+49: CMOVNS Gv,Ev (es) | CMOVNS Gv,Ev (66),(es) | CFCMOVNS Ev,Ev (es) | CFCMOVNS Ev,Ev (66),(es) | SETNS Eb (F2),(ev)
+4a: CMOVP Gv,Ev (es) | CMOVP Gv,Ev (66),(es) | CFCMOVP Ev,Ev (es) | CFCMOVP Ev,Ev (66),(es) | SETP Eb (F2),(ev)
+4b: CMOVNP Gv,Ev (es) | CMOVNP Gv,Ev (66),(es) | CFCMOVNP Ev,Ev (es) | CFCMOVNP Ev,Ev (66),(es) | SETNP Eb (F2),(ev)
+4c: CMOVL Gv,Ev (es) | CMOVL Gv,Ev (66),(es) | CFCMOVL Ev,Ev (es) | CFCMOVL Ev,Ev (66),(es) | SETL Eb (F2),(ev)
+4d: CMOVNL Gv,Ev (es) | CMOVNL Gv,Ev (66),(es) | CFCMOVNL Ev,Ev (es) | CFCMOVNL Ev,Ev (66),(es) | SETNL Eb (F2),(ev)
+4e: CMOVLE Gv,Ev (es) | CMOVLE Gv,Ev (66),(es) | CFCMOVLE Ev,Ev (es) | CFCMOVLE Ev,Ev (66),(es) | SETLE Eb (F2),(ev)
+4f: CMOVNLE Gv,Ev (es) | CMOVNLE Gv,Ev (66),(es) | CFCMOVNLE Ev,Ev (es) | CFCMOVNLE Ev,Ev (66),(es) | SETNLE Eb (F2),(ev)
+60: MOVBE Gv,Ev (es) | MOVBE Gv,Ev (66),(es)
+61: MOVBE Ev,Gv (es) | MOVBE Ev,Gv (66),(es)
+65: WRUSSD Md,Gd (66),(ev) | WRUSSQ Mq,Gq (66),(ev)
+66: ADCX Gy,Ey (66),(ev) | ADOX Gy,Ey (F3),(ev) | WRSSD Md,Gd (ev) | WRSSQ Mq,Gq (66),(ev)
+69: IMUL Gv,Ev,Iz (es) | IMUL Gv,Ev,Iz (66),(es)
+6b: IMUL Gv,Ev,Ib (es) | IMUL Gv,Ev,Ib (66),(es)
+80: Grp1 Eb,Ib (1A),(ev)
+81: Grp1 Ev,Iz (1A),(es)
+83: Grp1 Ev,Ib (1A),(es)
+# CTESTSCC instructions are: CTESTB, CTESTBE, CTESTF, CTESTL, CTESTLE, CTESTNB, CTESTNBE, CTESTNL,
+# CTESTNLE, CTESTNO, CTESTNS, CTESTNZ, CTESTO, CTESTS, CTESTT, CTESTZ
+84: CTESTSCC Eb,Gb (ev)
+85: CTESTSCC Ev,Gv (es) | CTESTSCC Ev,Gv (66),(es)
+88: POPCNT Gv,Ev (es) | POPCNT Gv,Ev (66),(es)
+8f: POP2 Bq,Rq (000),(11B),(ev)
+a5: SHLD Ev,Gv,CL (es) | SHLD Ev,Gv,CL (66),(es)
+ad: SHRD Ev,Gv,CL (es) | SHRD Ev,Gv,CL (66),(es)
+af: IMUL Gv,Ev (es) | IMUL Gv,Ev (66),(es)
+c0: Grp2 Eb,Ib (1A),(ev)
+c1: Grp2 Ev,Ib (1A),(es)
+d0: Grp2 Eb,1 (1A),(ev)
+d1: Grp2 Ev,1 (1A),(es)
+d2: Grp2 Eb,CL (1A),(ev)
+d3: Grp2 Ev,CL (1A),(es)
+f0: CRC32 Gy,Eb (es) | INVEPT Gq,Mdq (F3),(ev)
+f1: CRC32 Gy,Ey (es) | CRC32 Gy,Ey (66),(es) | INVVPID Gy,Mdq (F3),(ev)
+f2: INVPCID Gy,Mdq (F3),(ev)
+f4: TZCNT Gv,Ev (es) | TZCNT Gv,Ev (66),(es)
+f5: LZCNT Gv,Ev (es) | LZCNT Gv,Ev (66),(es)
+f6: Grp3_1 Eb (1A),(ev)
+f7: Grp3_2 Ev (1A),(es)
+f8: MOVDIR64B Gv,Mdqq (66),(ev) | ENQCMD Gv,Mdqq (F2),(ev) | ENQCMDS Gv,Mdqq (F3),(ev) | URDMSR Rq,Gq (F2),(11B),(ev) | UWRMSR Gq,Rq (F3),(11B),(ev)
+f9: MOVDIRI My,Gy (ev)
+fe: Grp4 (1A),(ev)
+ff: Grp5 (1A),(es) | PUSH2 Bq,Rq (110),(11B),(ev)
+EndTable
+
Table: EVEX map 5
Referrer:
AVXcode: 5
@@ -975,6 +1100,12 @@ d6: vfcmulcph Vx,Hx,Wx (F2),(ev) | vfmulcph Vx,Hx,Wx (F3),(ev)
d7: vfcmulcsh Vx,Hx,Wx (F2),(ev) | vfmulcsh Vx,Hx,Wx (F3),(ev)
EndTable
+Table: VEX map 7
+Referrer:
+AVXcode: 7
+f8: URDMSR Rq,Id (F2),(v1),(11B) | UWRMSR Id,Rq (F3),(v1),(11B)
+EndTable
+
GrpTable: Grp1
0: ADD
1: OR
@@ -1051,7 +1182,7 @@ GrpTable: Grp6
EndTable
GrpTable: Grp7
-0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) | PCONFIG (101),(11B) | ENCLV (000),(11B) | WRMSRNS (110),(11B)
+0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) | PCONFIG (101),(11B) | ENCLV (000),(11B) | WRMSRNS (110),(11B) | RDMSRLIST (F2),(110),(11B) | WRMSRLIST (F3),(110),(11B) | PBNDKB (111),(11B)
1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B) | ERETU (F3),(010),(11B) | ERETS (F2),(010),(11B)
2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) | ENCLU (111),(11B)
3: LIDT Ms
@@ -1137,6 +1268,8 @@ GrpTable: Grp16
1: prefetch T0
2: prefetch T1
3: prefetch T2
+6: prefetch IT1
+7: prefetch IT0
EndTable
GrpTable: Grp17