// https://github.com/nadavrot/memset_benchmark/blob/main/src/memset/impl.c core::arch::global_asm!{r#" .text .globl memset memset: mov rax, rcx cmp r8, 31 ja .LBB0_10 cmp r8, 4 ja .LBB0_5 test r8, r8 je .LBB0_26 mov byte ptr [rax], dl mov byte ptr [r8 + rax - 1], dl cmp r8, 3 jb .LBB0_26 mov byte ptr [rax + 1], dl mov byte ptr [rax + 2], dl ret .LBB0_10: movzx ecx, dl movd xmm0, ecx punpcklbw xmm0, xmm0 pshuflw xmm0, xmm0, 0 pshufd xmm0, xmm0, 0 cmp r8, 161 jb .LBB0_23 movdqu xmmword ptr [rax], xmm0 movdqu xmmword ptr [rax + 16], xmm0 mov rdx, rax and rdx, -32 lea r9, [rax + r8] lea rcx, [rax + r8] add rcx, -32 lea r8, [rdx + 192] cmp r8, r9 jae .LBB0_12 .LBB0_13: movdqa xmmword ptr [r8 - 160], xmm0 movdqa xmmword ptr [r8 - 144], xmm0 movdqa xmmword ptr [r8 - 128], xmm0 movdqa xmmword ptr [r8 - 112], xmm0 movdqa xmmword ptr [r8 - 96], xmm0 movdqa xmmword ptr [r8 - 80], xmm0 movdqa xmmword ptr [r8 - 64], xmm0 movdqa xmmword ptr [r8 - 48], xmm0 movdqa xmmword ptr [r8 - 32], xmm0 movdqa xmmword ptr [r8 - 16], xmm0 add r8, 160 cmp r8, r9 jb .LBB0_13 add r8, -160 mov rdx, r8 cmp rdx, rcx jb .LBB0_16 jmp .LBB0_17 .LBB0_5: cmp r8, 16 ja .LBB0_9 movzx edx, dl movabs rcx, 72340172838076673 imul rcx, rdx cmp r8, 8 jb .LBB0_8 mov qword ptr [rax], rcx mov qword ptr [rax + r8 - 8], rcx ret .LBB0_23: lea rcx, [rax + r8] add rcx, -32 mov rdx, rax .LBB0_24: movdqu xmmword ptr [rdx], xmm0 movdqu xmmword ptr [rdx + 16], xmm0 add rdx, 32 cmp rdx, rcx jb .LBB0_24 jmp .LBB0_25 .LBB0_12: add rdx, 32 cmp rdx, rcx jae .LBB0_17 .LBB0_16: movdqa xmmword ptr [rdx], xmm0 movdqa xmmword ptr [rdx + 16], xmm0 add rdx, 32 .LBB0_17: cmp rdx, rcx jb .LBB0_18 cmp rdx, rcx jb .LBB0_20 .LBB0_21: cmp rdx, rcx jae .LBB0_25 .LBB0_22: movdqa xmmword ptr [rdx], xmm0 movdqa xmmword ptr [rdx + 16], xmm0 .LBB0_25: movdqu xmmword ptr [rcx], xmm0 movdqu xmmword ptr [rcx + 16], xmm0 .LBB0_26: ret .LBB0_9: movzx ecx, dl movd xmm0, ecx punpcklbw xmm0, xmm0 pshuflw xmm0, xmm0, 0 pshufd xmm0, xmm0, 0 movdqu xmmword ptr [rax + r8 - 16], xmm0 movdqu xmmword ptr [rax], xmm0 ret .LBB0_18: movdqa xmmword ptr [rdx], xmm0 movdqa xmmword ptr [rdx + 16], xmm0 add rdx, 32 cmp rdx, rcx jae .LBB0_21 .LBB0_20: movdqa xmmword ptr [rdx], xmm0 movdqa xmmword ptr [rdx + 16], xmm0 add rdx, 32 cmp rdx, rcx jb .LBB0_22 jmp .LBB0_25 .LBB0_8: mov dword ptr [rax], ecx mov dword ptr [rax + r8 - 4], ecx ret "#}