section .text
global _fast_memcpy

; _fast_memcpy(dest = rdi, src = rsi, size = rdx) -> rax = dest
_fast_memcpy:
    push rbx
    mov rax, rdi            ; save original destination for the return value
    ; If size < 256 bytes, use the SSE/scalar path
    cmp rdx, 256
    jb .small_copy

    ; Align the destination to a 32-byte boundary
    mov rbx, rdi
    and rbx, 31
    jz .aligned
    mov rcx, 32
    sub rcx, rbx            ; rcx = bytes needed to reach alignment
    sub rdx, rcx
    rep movsb               ; advances rdi/rsi past the unaligned prefix

.aligned:
    mov r9, rdx
    shr r9, 5               ; number of 32-byte blocks
    jz .use_erms

.avx_loop:
    vmovdqu ymm0, [rsi]     ; source may be unaligned
    vmovdqa [rdi], ymm0     ; destination is 32-byte aligned
    add rdi, 32
    add rsi, 32
    dec r9
    jnz .avx_loop

    and rdx, 31             ; bytes remaining after the AVX loop

.use_erms:
    ; Prefetch ahead when a large block is left for rep movsb
    cmp rdx, 4096
    jb .no_prefetch
    mov rcx, rdx
    shr rcx, 6              ; one prefetch per 64-byte cache line
    mov r9, rsi
.prefetch_loop:
    prefetcht0 [r9 + 512]
    add r9, 64
    dec rcx
    jnz .prefetch_loop

.no_prefetch:
    mov rcx, rdx
    rep movsb               ; copy the remaining bytes
    jmp .done

.small_copy:
    cmp rdx, 16
    jb .tiny_copy
    mov rcx, rdx
    shr rcx, 4              ; number of 16-byte blocks
.sse_loop:
    movdqu xmm0, [rsi]
    movdqu [rdi], xmm0
    add rsi, 16
    add rdi, 16
    dec rcx
    jnz .sse_loop
    mov rcx, rdx
    and rcx, 15             ; copy the remaining tail bytes
    rep movsb
    jmp .done

.tiny_copy:
    mov rcx, rdx
    rep movsb

.done:
    vzeroupper
    pop rbx
    ret
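
; ---------------------------------------------------------------------------
; Minimal call-site sketch (illustrative only, not part of the routine above).
; Assumes the System V AMD64 convention used by _fast_memcpy:
;   rdi = destination, rsi = source, rdx = byte count, rax = original dest.
; The buffer labels and the copy_example wrapper are hypothetical names.
; ---------------------------------------------------------------------------
section .bss
    src_buf: resb 8192
    dst_buf: resb 8192

section .text
copy_example:
    lea rdi, [rel dst_buf]
    lea rsi, [rel src_buf]
    mov rdx, 8192
    call _fast_memcpy       ; rax now holds dst_buf
    ret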