// Source: http://developers.redhat.com/blog/2016/03/11/practical-micro-benchmarking-with-ltrace-and-sched/

/*
 * One drawback of the RDTSC instruction is that the CPU is allowed to reorder
 * it relative to other instructions, which causes noise in our results.
 * Fortunately, Intel has provided an RDTSCP instruction that's more
 * deterministic.  We pair it with a CPUID instruction, which is a serializing
 * instruction and therefore acts as a barrier, resulting in the two functions
 * below: rdtsc_s() opens a timed region and rdtsc_e() closes it.
 */

#include <stdint.h>

#if !defined(__x86_64__)
#error "rdtsc_s()/rdtsc_e() use x86-64 inline assembly (CPUID, RDTSC, RDTSCP)"
#endif

/*
 * Execute CPUID purely for its serializing side effect.
 *
 * EAX and ECX are inputs to CPUID, so both are set to 0 to always query the
 * same leaf instead of whatever happened to be in the registers.  All four
 * result registers are declared as outputs so the compiler knows they are
 * overwritten.  The "memory" clobber stops the compiler from moving loads or
 * stores across this point.
 */
static inline void rdtsc_serialize(void)
{
    unsigned eax = 0;
    unsigned ebx;
    unsigned ecx = 0;
    unsigned edx;

    __asm__ __volatile__("cpuid"
                         : "+a" (eax), "=b" (ebx), "+c" (ecx), "=d" (edx)
                         :
                         : "memory");
    (void)eax;
    (void)ebx;
    (void)ecx;
    (void)edx;
}

/*
 * Read the time-stamp counter at the START of a measured region.
 *
 * CPUID is issued first so that earlier instructions have completed before
 * RDTSC samples the counter.
 *
 * Returns the 64-bit counter value (EDX:EAX) as an int64_t.
 */
static inline int64_t rdtsc_s(void)
{
    unsigned lo;
    unsigned hi;

    rdtsc_serialize();
    __asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi) : : "memory");

    /* Widen before shifting: shifting a 32-bit value by 32 is undefined. */
    return (int64_t)(((uint64_t)hi << 32) | (uint64_t)lo);
}

/*
 * Read the time-stamp counter at the END of a measured region.
 *
 * RDTSCP is used for the read, and the CPUID that follows keeps later
 * instructions from starting before the read has been taken.  RDTSCP also
 * writes IA32_TSC_AUX into ECX, so that register is declared as an output
 * even though the value is discarded.
 *
 * Returns the 64-bit counter value (EDX:EAX) as an int64_t.
 */
static inline int64_t rdtsc_e(void)
{
    unsigned lo;
    unsigned hi;
    unsigned aux;

    __asm__ __volatile__("rdtscp" : "=a" (lo), "=d" (hi), "=c" (aux) : : "memory");
    (void)aux;
    rdtsc_serialize();

    /* Widen before shifting: shifting a 32-bit value by 32 is undefined. */
    return (int64_t)(((uint64_t)hi << 32) | (uint64_t)lo);
}

// . . .
//
// Usage:
//
//     clocks_before = rdtsc_s ();
//     p = malloc (i);                  // Test goes here
//     clocks_after = rdtsc_e ();
//     clocks_per_malloc = clocks_after - clocks_before;

// Machine setup notes:
//
// Let the OS use CPU #0.
// Boot options:
//     linux . . . isolcpus=1,2,3,4,5,6,7
// Check:
//     taskset -p $$
// Interrupt affinity:
//     cd /proc/irq
//     for i in */smp_affinity; do echo 1 > $i; done