-
-
Save Lewiscowles1986/90191c59c9aedf3d08bf0b129065cccc to your computer and use it in GitHub Desktop.
| // Integer and float benchmark for Win32 and Win64 | |
| // Results are below main(), line 91 | |
| #include <stdlib.h> | |
| #include <stdio.h> | |
| #ifdef _WIN32 | |
| #include <sys/timeb.h> | |
| #else | |
| #include <sys/time.h> | |
| #endif | |
| #include <time.h> | |
// Returns the current time of day in seconds, including a fractional part,
// so that two readings can be subtracted to obtain an elapsed interval.
//
// Windows builds read the clock through _ftime (millisecond field); all other
// builds use gettimeofday (microsecond field).
//
// If gettimeofday fails, the error is reported through perror and 0.0 is
// returned. The timeval is never filled in on failure, so it must not be read.
double
mygettime(void) {
#ifdef _WIN32
    struct _timeb tb;
    _ftime(&tb);
    return (double)tb.time + (0.001 * (double)tb.millitm);
#else
    struct timeval tv;
    if (gettimeofday(&tv, 0) < 0) {
        perror("oops");
        return 0.0;  // tv is indeterminate here; do not compute with it
    }
    return (double)tv.tv_sec + (0.000001 * (double)tv.tv_usec);
#endif
}
| template< typename Type > | |
| void my_test(const char* name) { | |
| volatile Type v = 0; | |
| // Do not use constants or repeating values | |
| // to avoid loop unroll optimizations. | |
| // All values >0 to avoid division by 0 | |
| Type v0 = (Type)(rand() % 256)/16 + 1; | |
| Type v1 = (Type)(rand() % 256)/16 + 1; | |
| Type v2 = (Type)(rand() % 256)/16 + 1; | |
| Type v3 = (Type)(rand() % 256)/16 + 1; | |
| Type v4 = (Type)(rand() % 256)/16 + 1; | |
| Type v5 = (Type)(rand() % 256)/16 + 1; | |
| Type v6 = (Type)(rand() % 256)/16 + 1; | |
| Type v7 = (Type)(rand() % 256)/16 + 1; | |
| double t1 = mygettime(); | |
| for (size_t i = 0; i < 100000000; ++i) { | |
| v += v0; | |
| v += v2; | |
| v += v4; | |
| v += v6; | |
| } | |
| printf("%s add: %f\n", name, mygettime() - t1); | |
| t1 = mygettime(); | |
| for (size_t i = 0; i < 100000000; ++i) { | |
| v -= v1; | |
| v -= v3; | |
| v -= v5; | |
| v -= v7; | |
| } | |
| printf("%s sub: %f\n", name, mygettime() - t1); | |
| t1 = mygettime(); | |
| for (size_t i = 0; i < 100000000; ++i) { | |
| v *= v0; | |
| v *= v2; | |
| v *= v4; | |
| v *= v6; | |
| } | |
| printf("%s mul: %f\n", name, mygettime() - t1); | |
| t1 = mygettime(); | |
| for (size_t i = 0; i < 100000000; ++i) { | |
| v /= v1; | |
| v /= v3; | |
| v /= v5; | |
| v /= v7; | |
| } | |
| printf("%s div: %f\n", name, mygettime() - t1); | |
| } | |
| int main() { | |
| my_test< short >(" short"); | |
| my_test< int >(" int"); | |
| my_test< long >(" long"); | |
| my_test< long long >("long long"); | |
| my_test< float >(" float"); | |
| my_test< double >(" double"); | |
| return 0; | |
| } |
Using volatile Type sink = v; inside the loop (after every v *= foo) would force it to store every result separately (so it couldn't optimize the add loop to a single multiply), without having to use volatile v. So the compiler could still keep v in a register.
You can instead use inline asm to force the compiler to have a result in a register without adding any extra instructions, if you don't mind using GNU C extensions. See https://kojirion.github.io/2016/04/04/Profiling.html for an escape function, from Chandler Carruth's CppCon2015 talk: "Tuning C++: Benchmarks, and CPUs, and Compilers! Oh My!" https://www.youtube.com/watch?v=nXaxk27zwlk. (Using perf on Linux.)
Also note that repeated division will quickly make v = 0, which is the fastest case for div / idiv. The other operations don't have data-dependent performance, but divide does on most CPUs. Especially for 64-bit, 0 can be 2x faster than large dividends.
For example, Agner Fog lists `idiv r64` latency for Haswell as 39-103 (and throughput as 24-81); see http://agner.org/optimize/. The effect is not as big for 32-bit and smaller operands, e.g. latency = 22-29 for Haswell `idiv r32`.
Signed overflow is UB, although I don't think the compiler will "see it" at compile time, so it shouldn't actually be a problem on normal architectures.
Most of the difference between add and mul is hidden by using `volatile` inside the loop; instead of seeing 3x the latency for this dependency chain, you only see `5+1` vs. `5+3`.