Files
xrt/src/win64/memset.rs
Jessie 7c2ba320fa init
2024-01-26 21:03:30 -05:00

130 lines
3.5 KiB
Rust

// https://github.com/nadavrot/memset_benchmark/blob/main/src/memset/impl.c
//
// x86-64 `memset`, emitted as module-level assembly and exported as the global
// symbol `memset` in the `.text` section. The assembly is in Intel syntax
// (destination operand first). The `.LBB0_*` label names look
// compiler-generated, presumably from the C source linked above -- not
// verified here.
//
// Register contract, as used by the code (this matches the Microsoft x64
// calling convention for `memset(dest, c, n)`):
//   in:       rcx = dest, dl = fill byte, r8 = n (byte count, compared unsigned)
//   out:      rax = dest (copied from rcx on entry and never modified afterwards)
//   clobbers: rcx, rdx, r8, r9, xmm0, flags
//   The routine uses no stack, makes no calls and touches no other register.
//
// Strategy by size. The short cases use stores anchored at both ends of the
// buffer; those stores may overlap, which removes the need for a byte loop:
//   n == 0     return immediately (.LBB0_26).
//   1..=4      byte stores at offsets 0 and n-1; when n >= 3 also at offsets
//              1 and 2.
//   5..=7      (.LBB0_8) two dword stores at offsets 0 and n-4.
//   8..=16     two qword stores at offsets 0 and n-8.
//              For 5..=16 the pattern is the zero-extended fill byte times
//              72340172838076673 (0x0101010101010101), which replicates the
//              byte into all eight bytes of rcx; the dword case stores ecx.
//   17..=31    (.LBB0_9) two unaligned 16-byte stores at offsets n-16 and 0.
//   32..=160   (.LBB0_23 / .LBB0_24) do-while loop of unaligned 32-byte
//              stores (two movdqu each) walking up from dest while the cursor
//              is below dest+n-32, then the last 32 bytes at dest+n-32
//              (.LBB0_25).
//   >= 161     (.LBB0_10 fall-through) described below.
//
// For n >= 17 the fill byte is broadcast to all 16 bytes of xmm0 with
// movd + punpcklbw + pshuflw + pshufd.
//
// Large path (n >= 161):
//   1. Store the first 32 bytes unaligned at dest.
//   2. rdx = dest rounded down to a multiple of 32, r9 = dest+n,
//      rcx = dest+n-32, r8 = rdx+192.
//   3. .LBB0_13: each iteration writes the 160 bytes ending at r8 with ten
//      movdqa stores, then advances r8 by 160. A chunk is only written while
//      r8 < r9, so the loop never writes past the end of the buffer. If the
//      very first chunk would not fit, the loop is skipped via .LBB0_12.
//   4. After the loop rdx is set to the first address not yet written by the
//      aligned stores. At most 160 bytes remain beyond rdx at that point, so
//      up to four more aligned 32-byte stores (.LBB0_16, .LBB0_18, .LBB0_20,
//      .LBB0_22), each guarded by rdx < rcx, plus the final unaligned 32-byte
//      store at rcx (.LBB0_25) cover the rest.
//
// Alignment note: movdqa needs a 16-byte-aligned address. Every movdqa here
// addresses (dest & -32) plus a multiple of 16, so this holds for any dest;
// the stores that touch dest or dest+n directly all use movdqu.
//
// NOTE(review): in .LBB0_17 the second `cmp rdx, rcx` / `jb .LBB0_20` pair
// repeats the comparison that has just fallen through, so that branch is never
// taken; .LBB0_20 is only reached by falling out of .LBB0_18.
core::arch::global_asm!{r#"
.text
.globl memset
memset:
mov rax, rcx
cmp r8, 31
ja .LBB0_10
cmp r8, 4
ja .LBB0_5
test r8, r8
je .LBB0_26
mov byte ptr [rax], dl
mov byte ptr [r8 + rax - 1], dl
cmp r8, 3
jb .LBB0_26
mov byte ptr [rax + 1], dl
mov byte ptr [rax + 2], dl
ret
.LBB0_10:
movzx ecx, dl
movd xmm0, ecx
punpcklbw xmm0, xmm0
pshuflw xmm0, xmm0, 0
pshufd xmm0, xmm0, 0
cmp r8, 161
jb .LBB0_23
movdqu xmmword ptr [rax], xmm0
movdqu xmmword ptr [rax + 16], xmm0
mov rdx, rax
and rdx, -32
lea r9, [rax + r8]
lea rcx, [rax + r8]
add rcx, -32
lea r8, [rdx + 192]
cmp r8, r9
jae .LBB0_12
.LBB0_13:
movdqa xmmword ptr [r8 - 160], xmm0
movdqa xmmword ptr [r8 - 144], xmm0
movdqa xmmword ptr [r8 - 128], xmm0
movdqa xmmword ptr [r8 - 112], xmm0
movdqa xmmword ptr [r8 - 96], xmm0
movdqa xmmword ptr [r8 - 80], xmm0
movdqa xmmword ptr [r8 - 64], xmm0
movdqa xmmword ptr [r8 - 48], xmm0
movdqa xmmword ptr [r8 - 32], xmm0
movdqa xmmword ptr [r8 - 16], xmm0
add r8, 160
cmp r8, r9
jb .LBB0_13
add r8, -160
mov rdx, r8
cmp rdx, rcx
jb .LBB0_16
jmp .LBB0_17
.LBB0_5:
cmp r8, 16
ja .LBB0_9
movzx edx, dl
movabs rcx, 72340172838076673
imul rcx, rdx
cmp r8, 8
jb .LBB0_8
mov qword ptr [rax], rcx
mov qword ptr [rax + r8 - 8], rcx
ret
.LBB0_23:
lea rcx, [rax + r8]
add rcx, -32
mov rdx, rax
.LBB0_24:
movdqu xmmword ptr [rdx], xmm0
movdqu xmmword ptr [rdx + 16], xmm0
add rdx, 32
cmp rdx, rcx
jb .LBB0_24
jmp .LBB0_25
.LBB0_12:
add rdx, 32
cmp rdx, rcx
jae .LBB0_17
.LBB0_16:
movdqa xmmword ptr [rdx], xmm0
movdqa xmmword ptr [rdx + 16], xmm0
add rdx, 32
.LBB0_17:
cmp rdx, rcx
jb .LBB0_18
cmp rdx, rcx
jb .LBB0_20
.LBB0_21:
cmp rdx, rcx
jae .LBB0_25
.LBB0_22:
movdqa xmmword ptr [rdx], xmm0
movdqa xmmword ptr [rdx + 16], xmm0
.LBB0_25:
movdqu xmmword ptr [rcx], xmm0
movdqu xmmword ptr [rcx + 16], xmm0
.LBB0_26:
ret
.LBB0_9:
movzx ecx, dl
movd xmm0, ecx
punpcklbw xmm0, xmm0
pshuflw xmm0, xmm0, 0
pshufd xmm0, xmm0, 0
movdqu xmmword ptr [rax + r8 - 16], xmm0
movdqu xmmword ptr [rax], xmm0
ret
.LBB0_18:
movdqa xmmword ptr [rdx], xmm0
movdqa xmmword ptr [rdx + 16], xmm0
add rdx, 32
cmp rdx, rcx
jae .LBB0_21
.LBB0_20:
movdqa xmmword ptr [rdx], xmm0
movdqa xmmword ptr [rdx + 16], xmm0
add rdx, 32
cmp rdx, rcx
jb .LBB0_22
jmp .LBB0_25
.LBB0_8:
mov dword ptr [rax], ecx
mov dword ptr [rax + r8 - 4], ecx
ret
"#}