summary | refs | log | tree | commit | diff | stats
path: root/MdePkg/Library/BaseMemoryLibSse2/X64
diff options
context:
space:
mode:
Diffstat (limited to 'MdePkg/Library/BaseMemoryLibSse2/X64')
-rw-r--r-- MdePkg/Library/BaseMemoryLibSse2/X64/SetMem.nasm 9
-rw-r--r-- MdePkg/Library/BaseMemoryLibSse2/X64/SetMem16.nasm 11
-rw-r--r-- MdePkg/Library/BaseMemoryLibSse2/X64/SetMem32.nasm 9
-rw-r--r-- MdePkg/Library/BaseMemoryLibSse2/X64/SetMem64.nasm 19
-rw-r--r-- MdePkg/Library/BaseMemoryLibSse2/X64/ZeroMem.nasm 13
5 files changed, 42 insertions, 19 deletions
diff --git a/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem.nasm b/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem.nasm
index 5bd1c2262d..28b11ee586 100644
--- a/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem.nasm
+++ b/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem.nasm
@@ -42,8 +42,8 @@ ASM_PFX(InternalMemSetMem):
rep stosb
.0:
mov rcx, rdx
- and rdx, 15
- shr rcx, 4
+ and rdx, 63
+ shr rcx, 6
jz @SetBytes
mov ah, al ; ax <- Value repeats twice
movdqa [rsp + 0x10], xmm0 ; save xmm0
@@ -52,7 +52,10 @@ ASM_PFX(InternalMemSetMem):
movlhps xmm0, xmm0 ; xmm0 <- Value repeats 16 times
.1:
movntdq [rdi], xmm0 ; rdi should be 16-byte aligned
- add rdi, 16
+ movntdq [rdi + 16], xmm0
+ movntdq [rdi + 32], xmm0
+ movntdq [rdi + 48], xmm0
+ add rdi, 64
loop .1
mfence
movdqa xmm0, [rsp + 0x10] ; restore xmm0
diff --git a/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem16.nasm b/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem16.nasm
index 90d159820a..375be19313 100644
--- a/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem16.nasm
+++ b/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem16.nasm
@@ -33,7 +33,7 @@ ASM_PFX(InternalMemSetMem16):
mov r9, rdi
xor rcx, rcx
sub rcx, rdi
- and rcx, 15
+ and rcx, 63
mov rax, r8
jz .0
shr rcx, 1
@@ -43,15 +43,18 @@ ASM_PFX(InternalMemSetMem16):
rep stosw
.0:
mov rcx, rdx
- and edx, 7
- shr rcx, 3
+ and edx, 31
+ shr rcx, 5
jz @SetWords
movd xmm0, eax
pshuflw xmm0, xmm0, 0
movlhps xmm0, xmm0
.1:
movntdq [rdi], xmm0
- add rdi, 16
+ movntdq [rdi + 16], xmm0
+ movntdq [rdi + 32], xmm0
+ movntdq [rdi + 48], xmm0
+ add rdi, 64
loop .1
mfence
@SetWords:
diff --git a/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem32.nasm b/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem32.nasm
index 928e086889..5d12beaa9a 100644
--- a/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem32.nasm
+++ b/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem32.nasm
@@ -43,14 +43,17 @@ ASM_PFX(InternalMemSetMem32):
rep stosd
.0:
mov rcx, rdx
- and edx, 3
- shr rcx, 2
+ and edx, 15
+ shr rcx, 4
jz @SetDwords
movd xmm0, eax
pshufd xmm0, xmm0, 0
.1:
movntdq [rdi], xmm0
- add rdi, 16
+ movntdq [rdi + 16], xmm0
+ movntdq [rdi + 32], xmm0
+ movntdq [rdi + 48], xmm0
+ add rdi, 64
loop .1
mfence
@SetDwords:
diff --git a/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem64.nasm b/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem64.nasm
index d771810542..485f74ddac 100644
--- a/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem64.nasm
+++ b/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem64.nasm
@@ -37,17 +37,28 @@ ASM_PFX(InternalMemSetMem64):
add rdx, 8
dec rcx
.0:
- shr rcx, 1
+ push rbx
+ mov rbx, rcx
+ and rbx, 7
+ shr rcx, 3
jz @SetQwords
movlhps xmm0, xmm0
.1:
movntdq [rdx], xmm0
- lea rdx, [rdx + 16]
+ movntdq [rdx + 16], xmm0
+ movntdq [rdx + 32], xmm0
+ movntdq [rdx + 48], xmm0
+ lea rdx, [rdx + 64]
loop .1
mfence
@SetQwords:
- jnc .2
- mov [rdx], r8
+ push rdi
+ mov rcx, rbx
+ mov rax, r8
+ mov rdi, rdx
+ rep stosq
+ pop rdi
.2:
+ pop rbx
ret
diff --git a/MdePkg/Library/BaseMemoryLibSse2/X64/ZeroMem.nasm b/MdePkg/Library/BaseMemoryLibSse2/X64/ZeroMem.nasm
index 5ddcae9ca5..21f504e3b7 100644
--- a/MdePkg/Library/BaseMemoryLibSse2/X64/ZeroMem.nasm
+++ b/MdePkg/Library/BaseMemoryLibSse2/X64/ZeroMem.nasm
@@ -32,7 +32,7 @@ ASM_PFX(InternalMemZeroMem):
xor rcx, rcx
xor eax, eax
sub rcx, rdi
- and rcx, 15
+ and rcx, 63
mov r8, rdi
jz .0
cmp rcx, rdx
@@ -41,13 +41,16 @@ ASM_PFX(InternalMemZeroMem):
rep stosb
.0:
mov rcx, rdx
- and edx, 15
- shr rcx, 4
+ and edx, 63
+ shr rcx, 6
jz @ZeroBytes
pxor xmm0, xmm0
.1:
- movntdq [rdi], xmm0 ; rdi should be 16-byte aligned
- add rdi, 16
+ movntdq [rdi], xmm0
+ movntdq [rdi + 16], xmm0
+ movntdq [rdi + 32], xmm0
+ movntdq [rdi + 48], xmm0
+ add rdi, 64
loop .1
mfence
@ZeroBytes: