// Rust · 130 lines · 3.5 KiB
// https://github.com/nadavrot/memset_benchmark/blob/main/src/memset/impl.c
core::arch::global_asm!{r#"
|
|
.text
|
|
.globl memset
|
|
memset:
|
|
mov rax, rcx
|
|
cmp r8, 31
|
|
ja .LBB0_10
|
|
cmp r8, 4
|
|
ja .LBB0_5
|
|
test r8, r8
|
|
je .LBB0_26
|
|
mov byte ptr [rax], dl
|
|
mov byte ptr [r8 + rax - 1], dl
|
|
cmp r8, 3
|
|
jb .LBB0_26
|
|
mov byte ptr [rax + 1], dl
|
|
mov byte ptr [rax + 2], dl
|
|
ret
|
|
.LBB0_10:
|
|
movzx ecx, dl
|
|
movd xmm0, ecx
|
|
punpcklbw xmm0, xmm0
|
|
pshuflw xmm0, xmm0, 0
|
|
pshufd xmm0, xmm0, 0
|
|
cmp r8, 161
|
|
jb .LBB0_23
|
|
movdqu xmmword ptr [rax], xmm0
|
|
movdqu xmmword ptr [rax + 16], xmm0
|
|
mov rdx, rax
|
|
and rdx, -32
|
|
lea r9, [rax + r8]
|
|
lea rcx, [rax + r8]
|
|
add rcx, -32
|
|
lea r8, [rdx + 192]
|
|
cmp r8, r9
|
|
jae .LBB0_12
|
|
.LBB0_13:
|
|
movdqa xmmword ptr [r8 - 160], xmm0
|
|
movdqa xmmword ptr [r8 - 144], xmm0
|
|
movdqa xmmword ptr [r8 - 128], xmm0
|
|
movdqa xmmword ptr [r8 - 112], xmm0
|
|
movdqa xmmword ptr [r8 - 96], xmm0
|
|
movdqa xmmword ptr [r8 - 80], xmm0
|
|
movdqa xmmword ptr [r8 - 64], xmm0
|
|
movdqa xmmword ptr [r8 - 48], xmm0
|
|
movdqa xmmword ptr [r8 - 32], xmm0
|
|
movdqa xmmword ptr [r8 - 16], xmm0
|
|
add r8, 160
|
|
cmp r8, r9
|
|
jb .LBB0_13
|
|
add r8, -160
|
|
mov rdx, r8
|
|
cmp rdx, rcx
|
|
jb .LBB0_16
|
|
jmp .LBB0_17
|
|
.LBB0_5:
|
|
cmp r8, 16
|
|
ja .LBB0_9
|
|
movzx edx, dl
|
|
movabs rcx, 72340172838076673
|
|
imul rcx, rdx
|
|
cmp r8, 8
|
|
jb .LBB0_8
|
|
mov qword ptr [rax], rcx
|
|
mov qword ptr [rax + r8 - 8], rcx
|
|
ret
|
|
.LBB0_23:
|
|
lea rcx, [rax + r8]
|
|
add rcx, -32
|
|
mov rdx, rax
|
|
.LBB0_24:
|
|
movdqu xmmword ptr [rdx], xmm0
|
|
movdqu xmmword ptr [rdx + 16], xmm0
|
|
add rdx, 32
|
|
cmp rdx, rcx
|
|
jb .LBB0_24
|
|
jmp .LBB0_25
|
|
.LBB0_12:
|
|
add rdx, 32
|
|
cmp rdx, rcx
|
|
jae .LBB0_17
|
|
.LBB0_16:
|
|
movdqa xmmword ptr [rdx], xmm0
|
|
movdqa xmmword ptr [rdx + 16], xmm0
|
|
add rdx, 32
|
|
.LBB0_17:
|
|
cmp rdx, rcx
|
|
jb .LBB0_18
|
|
cmp rdx, rcx
|
|
jb .LBB0_20
|
|
.LBB0_21:
|
|
cmp rdx, rcx
|
|
jae .LBB0_25
|
|
.LBB0_22:
|
|
movdqa xmmword ptr [rdx], xmm0
|
|
movdqa xmmword ptr [rdx + 16], xmm0
|
|
.LBB0_25:
|
|
movdqu xmmword ptr [rcx], xmm0
|
|
movdqu xmmword ptr [rcx + 16], xmm0
|
|
.LBB0_26:
|
|
ret
|
|
.LBB0_9:
|
|
movzx ecx, dl
|
|
movd xmm0, ecx
|
|
punpcklbw xmm0, xmm0
|
|
pshuflw xmm0, xmm0, 0
|
|
pshufd xmm0, xmm0, 0
|
|
movdqu xmmword ptr [rax + r8 - 16], xmm0
|
|
movdqu xmmword ptr [rax], xmm0
|
|
ret
|
|
.LBB0_18:
|
|
movdqa xmmword ptr [rdx], xmm0
|
|
movdqa xmmword ptr [rdx + 16], xmm0
|
|
add rdx, 32
|
|
cmp rdx, rcx
|
|
jae .LBB0_21
|
|
.LBB0_20:
|
|
movdqa xmmword ptr [rdx], xmm0
|
|
movdqa xmmword ptr [rdx + 16], xmm0
|
|
add rdx, 32
|
|
cmp rdx, rcx
|
|
jb .LBB0_22
|
|
jmp .LBB0_25
|
|
.LBB0_8:
|
|
mov dword ptr [rax], ecx
|
|
mov dword ptr [rax + r8 - 4], ecx
|
|
ret
|
|
"#} |