Skip to content

Instantly share code, notes, and snippets.

@alonsoir
Created April 16, 2025 07:03
Show Gist options
  • Save alonsoir/3120c3f92638be18174065415b0b6104 to your computer and use it in GitHub Desktop.
Save alonsoir/3120c3f92638be18174065415b0b6104 to your computer and use it in GitHub Desktop.

Revisions

  1. alonsoir created this gist Apr 16, 2025.
    3 changes: 3 additions & 0 deletions compile.sh
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,3 @@
    # Build and run the memcpy benchmark.
    # Stop at the first failing step so that a failed assemble/compile never
    # falls through to running a stale ./test_memcpy binary.
    set -e
    # Assemble the hand-written routine as a 64-bit Mach-O object.
    nasm -f macho64 fast_memcpy.asm -o fast_memcpy.o
    # Compile the C harness and link it against the assembled object.
    clang -O2 test_memcpy.c fast_memcpy.o -o test_memcpy
    ./test_memcpy
    97 changes: 97 additions & 0 deletions fast_memcpy.asm
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,97 @@
    section .text
    global _fast_memcpy

    ; void *fast_memcpy(void *dst, const void *src, size_t n)
    ;
    ; Arguments as this routine reads them:
    ;   rdi = dst, rsi = src, rdx = n
    ; Returns:
    ;   rax = original dst (on every path, including n == 0)
    ;
    ; Strategy:
    ;   n <  256 : 16-byte unaligned SSE moves, then a byte tail.
    ;   n >= 256 : byte-copy up to the next 32-byte boundary of dst, then
    ;              32-byte AVX moves (unaligned load, aligned store), then a
    ;              byte tail of fewer than 32 bytes.
    ;
    ; Only rax, rcx, rdx, rsi, rdi, xmm0/ymm0 and flags are modified, so no
    ; register needs to be saved on the stack.
    ; rep movsb copies forward only when the direction flag is clear.
    ; NOTE(review): assumes DF = 0 on entry - confirm against the platform ABI.
    ; Source and destination must not overlap.
    _fast_memcpy:
        mov rax, rdi            ; Capture dst for the return value before rdi moves

        ; If size < 256 bytes, use the SSE / byte copy
        cmp rdx, 256
        jb .small_copy

        ; Align destination to 32-byte boundary.
        ; rcx = (-dst) & 31 = number of bytes up to the next boundary (0 if aligned).
        mov rcx, rdi
        neg rcx
        and rcx, 31
        jz .aligned

        sub rdx, rcx            ; Those head bytes no longer count toward the bulk copy
        rep movsb               ; Copies the head and leaves rdi/rsi advanced past it;
                                ; rdi is now 32-byte aligned, as vmovdqa requires

    .aligned:
        ; rdx >= 256 - 31 = 225 here, so there are always at least 7 blocks.
        mov rcx, rdx
        shr rcx, 5              ; rcx = number of whole 32-byte blocks

    .avx_loop:
        vmovdqu ymm0, [rsi]     ; Source alignment is unknown: unaligned load
        vmovdqa [rdi], ymm0     ; Destination is aligned: aligned store
        add rsi, 32
        add rdi, 32
        dec rcx
        jnz .avx_loop

        ; Remaining 0..31 bytes
        and rdx, 31
        mov rcx, rdx
        rep movsb

        vzeroupper              ; Clear upper ymm halves before returning to SSE code
        ret

    .small_copy:
        cmp rdx, 16
        jb .byte_tail

        mov rcx, rdx
        shr rcx, 4              ; rcx = number of whole 16-byte blocks (>= 1)
    .sse_loop:
        movdqu xmm0, [rsi]
        movdqu [rdi], xmm0
        add rsi, 16
        add rdi, 16
        dec rcx
        jnz .sse_loop

        and rdx, 15             ; Remaining 0..15 bytes

    .byte_tail:
        mov rcx, rdx            ; rcx may be 0, in which case rep movsb copies nothing
        rep movsb
        ret
    21 changes: 21 additions & 0 deletions output.txt
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,21 @@
    ./compile_and_test_memcpy_test.sh
    ld: warning: no platform load command found in '/private/tmp/memcpy_test/fast_memcpy.o', assuming: macOS
    Size: 64 bytes
    fast_memcpy: 3.19 GB/s (48.08 cycles/iter)
    std memcpy: 10.97 GB/s (14.00 cycles/iter)
    --------------------
    Size: 256 bytes
    fast_memcpy: 11.49 GB/s (53.47 cycles/iter)
    std memcpy: 45.87 GB/s (13.39 cycles/iter)
    --------------------
    Size: 1024 bytes
    fast_memcpy: 33.02 GB/s (74.43 cycles/iter)
    std memcpy: 87.47 GB/s (28.10 cycles/iter)
    --------------------
    Size: 4096 bytes
    fast_memcpy: 44.69 GB/s (219.98 cycles/iter)
    std memcpy: 72.05 GB/s (136.45 cycles/iter)
    --------------------
    Size: 1048576 bytes
    fast_memcpy: 27.24 GB/s (92395.32 cycles/iter)
    std memcpy: 33.85 GB/s (74350.49 cycles/iter)
    70 changes: 70 additions & 0 deletions test_memcpy.c
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,70 @@
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <time.h>

    // Declare the assembly function.
    // The definition lives in fast_memcpy.asm, which exports the symbol
    // `_fast_memcpy`; the object is assembled as macho64 by compile.sh.
    // NOTE(review): the leading underscore is presumably the Mach-O C symbol
    // prefix - confirm if building for another object format.
    void *fast_memcpy(void *dst, const void *src, size_t n);

    // Read the CPU time-stamp counter via the `rdtsc` instruction.
    // Returns the raw 64-bit tick count (EDX:EAX combined); converting ticks
    // to time is left to the caller.
    static inline unsigned long long rdtsc(void) {
        unsigned int low_half, high_half;
        __asm__ volatile("rdtsc" : "=a"(low_half), "=d"(high_half));

        unsigned long long ticks = high_half;
        ticks <<= 32;
        ticks |= low_half;
        return ticks;
    }

    // Benchmark driver: for each buffer size, time fast_memcpy and the library
    // memcpy with rdtsc and print throughput and average cycles per call.
    // Returns 0 on success, EXIT_FAILURE if a buffer cannot be allocated.
    int main(void) {
        // Buffer sizes to test: 64B, 256B, 1KB, 4KB, 1MB.
        // The list is ascending; the last entry sizes the buffers.
        size_t sizes[] = {64, 256, 1024, 4096, 1048576};
        int num_sizes = sizeof(sizes) / sizeof(sizes[0]);
        size_t max_size = sizes[num_sizes - 1];

        // Estimate CPU frequency (rough approximation, adjust for your i9).
        // Used only to convert tick counts into nanoseconds.
        const double cpu_freq_ghz = 2.4; // i9-9980HK base frequency is ~2.4 GHz

        // Allocate 32-byte-aligned buffers. posix_memalign reports failure
        // through its return value and leaves the pointer unspecified, so
        // check it before touching either buffer.
        void *src = NULL;
        void *dst = NULL;
        if (posix_memalign(&src, 32, max_size) != 0) {
            fprintf(stderr, "posix_memalign failed for the source buffer\n");
            return EXIT_FAILURE;
        }
        if (posix_memalign(&dst, 32, max_size) != 0) {
            fprintf(stderr, "posix_memalign failed for the destination buffer\n");
            free(src);
            return EXIT_FAILURE;
        }

        // Fill source buffer with data
        memset(src, 0xAA, max_size);

        for (int s = 0; s < num_sizes; s++) {
            size_t size = sizes[s];
            // Fewer iterations for large sizes to keep the runtime reasonable
            int iterations = (size < 4096) ? 1000000 : 10000;

            // Test fast_memcpy
            unsigned long long start, end, cycles_fast = 0;
            for (int i = 0; i < iterations; i++) {
                start = rdtsc();
                fast_memcpy(dst, src, size);
                end = rdtsc();
                cycles_fast += (end - start);
            }

            // Test standard memcpy
            unsigned long long cycles_std = 0;
            for (int i = 0; i < iterations; i++) {
                start = rdtsc();
                memcpy(dst, src, size);
                end = rdtsc();
                cycles_std += (end - start);
            }

            // Average ticks per call, converted to nanoseconds
            double time_fast_ns = (cycles_fast / (double)iterations) / cpu_freq_ghz;
            double time_std_ns = (cycles_std / (double)iterations) / cpu_freq_ghz;

            // Bandwidth in bytes per second (divided by 1e9 below to print GB/s)
            double bandwidth_fast = (size / time_fast_ns) * 1e9;
            double bandwidth_std = (size / time_std_ns) * 1e9;

            printf("Size: %zu bytes\n", size);
            printf("fast_memcpy: %.2f GB/s (%.2f cycles/iter)\n", bandwidth_fast / 1e9, (double)cycles_fast / iterations);
            printf("std memcpy: %.2f GB/s (%.2f cycles/iter)\n", bandwidth_std / 1e9, (double)cycles_std / iterations);
            printf("--------------------\n");
        }

        free(src);
        free(dst);
        return 0;
    }