/*
 * Micro-benchmark comparing fast_memcpy() (external assembly routine) with
 * the C library memcpy() over several buffer sizes, timed with the x86 TSC.
 */

/* posix_memalign() is POSIX rather than ISO C; request its declaration
 * explicitly so the file also builds in strict modes such as -std=c11. */
#define _POSIX_C_SOURCE 200112L

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Copy routine under test; its definition lives in a separate assembly file. */
void *fast_memcpy(void *dst, const void *src, size_t n);

/* Alignment, in bytes, requested for both benchmark buffers. */
enum { BUFFER_ALIGNMENT = 32 };

/*
 * Frequency used to convert timestamp-counter ticks into nanoseconds.
 * NOTE(review): 2.4 GHz is the nominal base clock the original author quoted
 * for an i9-9980HK; adjust for the machine actually running the benchmark.
 */
static const double CPU_FREQ_GHZ = 2.4;

/*
 * Read the x86 time-stamp counter.
 *
 * Returns the 64-bit counter value assembled from EDX:EAX.  The "memory"
 * clobber makes the statement a compiler barrier, so the copy being timed
 * cannot be moved across the timestamp reads by the optimizer.  It is not a
 * CPU serializing instruction; out-of-order execution may still blur very
 * short measurements.
 */
static inline unsigned long long rdtsc(void)
{
    uint32_t lo, hi;

    __asm__ volatile("rdtsc" : "=a"(lo), "=d"(hi) : : "memory");
    return ((unsigned long long)hi << 32) | lo;
}

/*
 * Print throughput and average tick count for one measured routine.
 *
 * label        - routine name shown at the start of the line
 * size         - bytes copied per call
 * total_cycles - sum of TSC deltas over all calls
 * iterations   - number of timed calls
 */
static void print_result(const char *label, size_t size,
                         unsigned long long total_cycles, int iterations)
{
    double cycles_per_iter = (double)total_cycles / iterations;
    double time_ns = cycles_per_iter / CPU_FREQ_GHZ;
    /* bytes per nanosecond is numerically the same as GB/s (1e9 bytes/s). */
    double gb_per_s = (double)size / time_ns;

    printf("%s: %.2f GB/s (%.2f cycles/iter)\n", label, gb_per_s,
           cycles_per_iter);
}

/*
 * Run the benchmark for every configured size and print the results.
 *
 * Returns 0 on success, EXIT_FAILURE if a buffer cannot be allocated.
 */
int main(void)
{
    /* Buffer sizes to test: 64B, 256B, 1KB, 4KB, 1MB */
    static const size_t sizes[] = {64, 256, 1024, 4096, 1048576};
    const size_t num_sizes = sizeof(sizes) / sizeof(sizes[0]);

    /* Size the buffers for the largest entry, wherever it sits in the table. */
    size_t max_size = 0;
    for (size_t s = 0; s < num_sizes; s++) {
        if (sizes[s] > max_size) {
            max_size = sizes[s];
        }
    }

    /* Allocate aligned buffers; posix_memalign() returns an error number. */
    void *src = NULL;
    void *dst = NULL;
    int err = posix_memalign(&src, BUFFER_ALIGNMENT, max_size);
    if (err != 0) {
        fprintf(stderr, "posix_memalign(src) failed: %s\n", strerror(err));
        return EXIT_FAILURE;
    }
    err = posix_memalign(&dst, BUFFER_ALIGNMENT, max_size);
    if (err != 0) {
        fprintf(stderr, "posix_memalign(dst) failed: %s\n", strerror(err));
        free(src);
        return EXIT_FAILURE;
    }

    /* Fill the source buffer with a known pattern. */
    memset(src, 0xAA, max_size);

    for (size_t s = 0; s < num_sizes; s++) {
        const size_t size = sizes[s];
        /* Fewer iterations for large sizes to keep the runtime reasonable. */
        const int iterations = (size < 4096) ? 1000000 : 10000;
        unsigned long long start, end;

        /* Time fast_memcpy, one TSC delta per call. */
        unsigned long long cycles_fast = 0;
        for (int i = 0; i < iterations; i++) {
            start = rdtsc();
            fast_memcpy(dst, src, size);
            end = rdtsc();
            cycles_fast += (end - start);
        }

        /* Time the standard memcpy the same way. */
        unsigned long long cycles_std = 0;
        for (int i = 0; i < iterations; i++) {
            start = rdtsc();
            memcpy(dst, src, size);
            end = rdtsc();
            cycles_std += (end - start);
        }

        printf("Size: %zu bytes\n", size);
        print_result("fast_memcpy", size, cycles_fast, iterations);
        print_result("std memcpy", size, cycles_std, iterations);
        printf("--------------------\n");
    }

    free(src);
    free(dst);
    return 0;
}