Created
April 16, 2025 07:03
-
-
Save alonsoir/3120c3f92638be18174065415b0b6104 to your computer and use it in GitHub Desktop.
Revisions
-
alonsoir created this gist
Apr 16, 2025 .There are no files selected for viewing
; fast_memcpy.asm - memcpy replacement for x86-64 (System V calling convention).
;
; Build and run (macOS, Mach-O 64-bit):
;   nasm -f macho64 fast_memcpy.asm -o fast_memcpy.o
;   clang -O2 test_memcpy.c fast_memcpy.o -o test_memcpy
;   ./test_memcpy

section .text
global _fast_memcpy

; void *fast_memcpy(void *dst, const void *src, size_t n)
;
;   rdi = dst, rsi = src, rdx = n (bytes)
;   Returns the original dst in rax, like memcpy.
;
; Strategy:
;   n <  256 : 16-byte unaligned SSE chunks, then a byte tail.
;   n >= 256 : byte-copy until dst is 32-byte aligned, then 32-byte AVX chunks
;              (unaligned load, aligned store), then a byte tail.
;
; Only caller-saved registers (rax, rcx, rdx, rsi, rdi, xmm0/ymm0) are touched,
; so nothing has to be spilled to the stack. `rep movsb` relies on DF = 0,
; which the ABI guarantees at function entry.
; The regions must not overlap. The n >= 256 path requires AVX support.
_fast_memcpy:
    mov     rax, rdi            ; return value: always the original dst
    cmp     rdx, 256
    jb      .small_copy

    ; ---- head: advance to the next 32-byte boundary of dst ----
    mov     rcx, rdi
    neg     rcx
    and     rcx, 31             ; rcx = (-dst) & 31 = bytes up to the boundary (0 if aligned)
    sub     rdx, rcx            ; cannot underflow: n >= 256 > 31
    rep movsb                   ; advances rdi/rsi; they must NOT be rewound afterwards,
                                ; otherwise the aligned store below faults and the
                                ; final bytes of the buffer are never written

    ; ---- body: 32 bytes per iteration ----
    mov     rcx, rdx
    shr     rcx, 5              ; rdx >= 225 here, so at least 7 iterations
.avx_loop:
    vmovdqu ymm0, [rsi]         ; src alignment is unknown
    vmovdqa [rdi], ymm0         ; dst is 32-byte aligned at this point
    add     rsi, 32
    add     rdi, 32
    dec     rcx
    jnz     .avx_loop
    vzeroupper                  ; avoid AVX->SSE transition penalties in the caller

    ; ---- tail: remaining 0..31 bytes ----
    mov     rcx, rdx
    and     rcx, 31
    rep movsb
    ret

.small_copy:
    ; ---- n < 256: 16 bytes per iteration, no alignment requirement ----
    mov     rcx, rdx
    shr     rcx, 4
    jz      .small_tail         ; n < 16: only the byte tail is needed
.sse_loop:
    movdqu  xmm0, [rsi]
    movdqu  [rdi], xmm0
    add     rsi, 16
    add     rdi, 16
    dec     rcx
    jnz     .sse_loop
.small_tail:
    mov     rcx, rdx
    and     rcx, 15             ; remaining 0..15 bytes (all of n when n < 16)
    rep movsb
    ret
To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,21 @@ ./compile_and_test_memcpy_test.sh ld: warning: no platform load command found in '/private/tmp/memcpy_test/fast_memcpy.o', assuming: macOS Size: 64 bytes fast_memcpy: 3.19 GB/s (48.08 cycles/iter) std memcpy: 10.97 GB/s (14.00 cycles/iter) -------------------- Size: 256 bytes fast_memcpy: 11.49 GB/s (53.47 cycles/iter) std memcpy: 45.87 GB/s (13.39 cycles/iter) -------------------- Size: 1024 bytes fast_memcpy: 33.02 GB/s (74.43 cycles/iter) std memcpy: 87.47 GB/s (28.10 cycles/iter) -------------------- Size: 4096 bytes fast_memcpy: 44.69 GB/s (219.98 cycles/iter) std memcpy: 72.05 GB/s (136.45 cycles/iter) -------------------- Size: 1048576 bytes fast_memcpy: 27.24 GB/s (92395.32 cycles/iter) std memcpy: 33.85 GB/s (74350.49 cycles/iter) This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. 
/*
 * test_memcpy.c - micro-benchmark comparing fast_memcpy (fast_memcpy.asm)
 * against the C library memcpy for several buffer sizes.
 */
#define _POSIX_C_SOURCE 200112L /* declare posix_memalign under strict -std=c11 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

/* Implemented in fast_memcpy.asm; same contract as memcpy (returns dst). */
void *fast_memcpy(void *dst, const void *src, size_t n);

enum { BUFFER_ALIGNMENT = 32 };

/*
 * Read the CPU time-stamp counter. The result is in TSC ticks, not
 * nanoseconds; main() converts using an assumed clock frequency.
 * NOTE(review): RDTSC is not a serializing instruction, so the copy being
 * timed may not have fully retired when the second reading is taken.
 */
static inline unsigned long long rdtsc(void)
{
    unsigned int lo, hi;
    __asm__ volatile("rdtsc" : "=a"(lo), "=d"(hi));
    return ((unsigned long long)hi << 32) | lo;
}

/* Allocate `size` bytes aligned to BUFFER_ALIGNMENT; NULL (with a message) on failure. */
static void *alloc_aligned(size_t size)
{
    void *p = NULL;
    int rc = posix_memalign(&p, BUFFER_ALIGNMENT, size);
    if (rc != 0) {
        /* posix_memalign reports the error via its return value, not errno. */
        fprintf(stderr, "posix_memalign(%zu) failed: %s\n", size, strerror(rc));
        return NULL;
    }
    return p;
}

/*
 * For each buffer size: time fast_memcpy, verify the copy, time memcpy,
 * then print bandwidth and average cycles per call.
 * Returns EXIT_SUCCESS, or EXIT_FAILURE on allocation failure or a wrong copy.
 */
int main(void)
{
    /* 64B, 256B, 1KB, 4KB, 1MB - ascending, so the last entry is the largest. */
    static const size_t sizes[] = {64, 256, 1024, 4096, 1048576};
    const size_t num_sizes = sizeof sizes / sizeof sizes[0];
    const size_t max_size = sizes[num_sizes - 1];

    /* Rough approximation: i9-9980HK base frequency is ~2.4 GHz; adjust for your CPU. */
    const double cpu_freq_ghz = 2.4;

    int status = EXIT_SUCCESS;
    void *src = alloc_aligned(max_size);
    void *dst = alloc_aligned(max_size);
    if (src == NULL || dst == NULL) {
        status = EXIT_FAILURE;
        goto out;
    }

    memset(src, 0xAA, max_size);
    /* Touch every destination page up front so first-use page faults are not
     * charged to whichever routine happens to be measured first. */
    memset(dst, 0, max_size);

    for (size_t s = 0; s < num_sizes; s++) {
        const size_t size = sizes[s];
        /* Fewer iterations for large sizes to keep the runtime reasonable. */
        const int iterations = (size < 4096) ? 1000000 : 10000;

        /* Clear the target range so the verification below is meaningful. */
        memset(dst, 0, size);

        unsigned long long cycles_fast = 0;
        for (int i = 0; i < iterations; i++) {
            const unsigned long long start = rdtsc();
            fast_memcpy(dst, src, size);
            const unsigned long long end = rdtsc();
            cycles_fast += end - start;
        }

        /* A fast but wrong copy is worthless: check the result before reporting. */
        if (memcmp(dst, src, size) != 0) {
            fprintf(stderr, "fast_memcpy produced a wrong copy for size %zu\n", size);
            status = EXIT_FAILURE;
            goto out;
        }

        unsigned long long cycles_std = 0;
        for (int i = 0; i < iterations; i++) {
            const unsigned long long start = rdtsc();
            memcpy(dst, src, size);
            const unsigned long long end = rdtsc();
            cycles_std += end - start;
        }

        const double cycles_per_iter_fast = (double)cycles_fast / iterations;
        const double cycles_per_iter_std = (double)cycles_std / iterations;

        /* cycles / GHz = nanoseconds; bytes per nanosecond = GB/s. */
        const double gbps_fast = (double)size / (cycles_per_iter_fast / cpu_freq_ghz);
        const double gbps_std = (double)size / (cycles_per_iter_std / cpu_freq_ghz);

        printf("Size: %zu bytes\n", size);
        printf("fast_memcpy: %.2f GB/s (%.2f cycles/iter)\n", gbps_fast, cycles_per_iter_fast);
        printf("std memcpy: %.2f GB/s (%.2f cycles/iter)\n", gbps_std, cycles_per_iter_std);
        printf("--------------------\n");
    }

out:
    free(src); /* free(NULL) is a no-op */
    free(dst);
    return status;
}