// Copyright (c) 2021 Francesco Mazzoli // // Permission to use, copy, modify, and distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR // ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN // ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. // // Branchless, vectorized `atan2f`. Various functions of increasing // performance are presented. The fastest version is 50~ faster than libc // on batch workloads, outputing a result every ~2 clock cycles, compared to // ~110 for libc. The functions all use the same `atan` approximation, and their // max error is around ~1/10000 of a degree. // // They also do not handle inf / -inf // and the origin as an input as they should -- in our case these are a sign // that something is wrong anyway. Moreover, manual_2 does not handle NaN // correctly (it drops them silently), and all the auto_ functions do not // handle -0 correctly. But manual_1 handles everything but +-inf and +-0,+-0 // correctly. // // Tested on a Xeon W-2145, a Skylake processor. Compiled with // // $ clang++ --version // clang version 12.0.1 // $ clang++ -static --std=c++20 -march=skylake -O3 -Wall vectorized-atan2f.cpp -lm -o vectorized-atan2f // // Results: // // $ ./vectorized-atan2f --test-edge-cases 100000 100 1000 // Generating data... done. // Tests will read 824kB and write 412kB (103048 points) // Running 1000 warm ups and 1000 iterations // // baseline: 2.46 s, 0.32GB/s, 105.79 cycles/elem, 129.34 instrs/elem, 1.22 instrs/cycle, 27.76 branches/elem, 7.08% branch misses, 0.47% cache misses, 4.29GHz // auto_1: 0.21 s, 3.75GB/s, 8.29 cycles/elem, 8.38 instrs/elem, 1.01 instrs/cycle, 0.13 branches/elem, 0.01% branch misses, 0.03% cache misses, 3.89GHz, 0.000109283deg max error, max error point: -0.563291132, -0.544303775 // auto_2: 97.50ms, 8.19GB/s, 3.79 cycles/elem, 5.38 instrs/elem, 1.42 instrs/cycle, 0.13 branches/elem, 0.01% branch misses, 0.01% cache misses, 3.88GHz, 0.000109283deg max error, max error point: -0.854430377, +0.107594967 // auto_3: 75.95ms, 10.52GB/s, 2.95 cycles/elem, 4.13 instrs/elem, 1.40 instrs/cycle, 0.13 branches/elem, 0.01% branch misses, 0.03% cache misses, 3.88GHz, 0.000109283deg max error, max error point: -0.854430377, +0.107594967 // auto_4: 55.89ms, 14.29GB/s, 2.17 cycles/elem, 3.63 instrs/elem, 1.67 instrs/cycle, 0.13 branches/elem, 0.01% branch misses, 0.01% cache misses, 3.88GHz, 0.000109283deg max error, max error point: -0.854430377, +0.107594967 // manual_1: 52.57ms, 15.20GB/s, 2.04 cycles/elem, 3.63 instrs/elem, 1.78 instrs/cycle, 0.13 branches/elem, 0.01% branch misses, 0.03% cache misses, 3.88GHz, 0.000109283deg max error, max error point: -0.854430377, +0.107594967 // manual_2: 50.16ms, 15.93GB/s, 1.95 cycles/elem, 3.88 instrs/elem, 1.99 instrs/cycle, 0.13 branches/elem, 0.01% branch misses, 0.02% cache misses, 3.88GHz, 0.000109283deg max error, max error point: -0.854430377, +0.107594967 // // The atan approximation is from sheet 11 of "Approximations for digital // computers", C. Hastings, 1955, a delightful document overall. // // Functions auto_1 to auto_5 are automatically vectorized. manual_1 and manual_2 // are vectorized manually. // // You'll get quite different results with g++. The main problem with g++ is that // it vectorizes less aggressively. However it inserts FMA instructions _more_ // aggresively, which is a plus. clang needs the `fp-contract` pragma // or an explicit `fmaf`. // // Moreover, the #include #include #include #include #include #include #include #include #include #include #include #include #include #define USE_AVX using namespace std; #define NOINLINE __attribute__((noinline)) #define UNUSED __attribute__((unused)) // -------------------------------------------------------------------- // AVX utils #ifdef USE_AVX template inline void assert_avx_aligned(const A* ptr) { if (reinterpret_cast(ptr) % 32 != 0) { cerr << "Pointer " << ptr << " is not 32-byte aligned, exiting" << endl; exit(EXIT_FAILURE); } } inline void assert_multiple_of_8(size_t num) { if (num % 8 != 0) { cerr << "Array size " << num << " is not a multiple of 8, exiting" << endl; exit(EXIT_FAILURE); } } #endif // -------------------------------------------------------------------- // functions NOINLINE static void atan2_baseline(size_t cases, const float* ys, const float* xs, float* out) { for (size_t i = 0; i < cases; i++) { out[i] = atan2f(ys[i], xs[i]); } } // not tested since it is very slow. NOINLINE UNUSED static void atan2_fpatan(size_t cases, const float* ys, const float* xs, float* out) { for (size_t i = 0; i < cases; i++) { asm ( "flds (%[ys], %[i], 4)\n" "flds (%[xs], %[i], 4)\n" "fpatan\n" "fstps (%[out], %[i], 4)\n" : : [ys]"r"(ys), [xs]"r"(xs), [out]"r"(out), [i]"r"(i) ); } } // Polynomial approximation of atan between [-1, 1]. Stated max error ~0.000001rad. // See comment at the beginning of file for source. inline float atan_approximation(float x) { float a1 = 0.99997726f; float a3 = -0.33262347f; float a5 = 0.19354346f; float a7 = -0.11643287f; float a9 = 0.05265332f; float a11 = -0.01172120f; float x_sq = x*x; return x * (a1 + x_sq * (a3 + x_sq * (a5 + x_sq * (a7 + x_sq * (a9 + x_sq * a11))))); } // First automatic version: naive translation of the maths NOINLINE static void atan2_auto_1(size_t num_points, const float* ys, const float* xs, float* out) { for (size_t i = 0; i < num_points; i++) { // Ensure input is in [-1, +1] float y = ys[i]; float x = xs[i]; bool swap = fabs(x) < fabs(y); float atan_input = (swap ? x : y) / (swap ? y : x); // Approximate atan float res = atan_approximation(atan_input); // If swapped, adjust atan output res = swap ? (atan_input >= 0.0f ? M_PI_2 : -M_PI_2) - res : res; // Adjust quadrants if (x >= 0.0f && y >= 0.0f) {} // 1st quadrant else if (x < 0.0f && y >= 0.0f) { res = M_PI + res; } // 2nd quadrant else if (x < 0.0f && y < 0.0f) { res = -M_PI + res; } // 3rd quadrant else if (x >= 0.0f && y < 0.0f) {} // 4th quadrant // Store result out[i] = res; } } // Second automatic version: get rid of casting to double NOINLINE static void atan2_auto_2(size_t num_points, const float* ys, const float* xs, float* out) { float pi = M_PI; float pi_2 = M_PI_2; for (size_t i = 0; i < num_points; i++) { // Ensure input is in [-1, +1] float y = ys[i]; float x = xs[i]; bool swap = fabs(x) < fabs(y); float atan_input = (swap ? x : y) / (swap ? y : x); // Approximate atan float res = atan_approximation(atan_input); // If swapped, adjust atan output res = swap ? (atan_input >= 0.0f ? pi_2 : -pi_2) - res : res; // Adjust quadrants if (x >= 0.0f && y >= 0.0f) {} // 1st quadrant else if (x < 0.0f && y >= 0.0f) { res = pi + res; } // 2nd quadrant else if (x < 0.0f && y < 0.0f) { res = -pi + res; } // 3rd quadrant else if (x >= 0.0f && y < 0.0f) {} // 4th quadrant // Store result out[i] = res; } } // Third automatic version: perform positive check for x and y once -- the compiler // can't assume that in the presence of NaNs since `0/0 >= 0` is false and `0/0 < 0` is also // false. NOINLINE static void atan2_auto_3(size_t num_points, const float* ys, const float* xs, float* out) { float pi = M_PI; float pi_2 = M_PI_2; for (size_t i = 0; i < num_points; i++) { // Ensure input is in [-1, +1] float y = ys[i]; float x = xs[i]; bool swap = fabs(x) < fabs(y); float atan_input = (swap ? x : y) / (swap ? y : x); // Approximate atan float res = atan_approximation(atan_input); // If swapped, adjust atan output res = swap ? (atan_input >= 0.0f ? pi_2 : -pi_2) - res : res; // Adjust the result depending on the input quadrant if (x < 0.0f) { res = (y >= 0.0f ? pi : -pi) + res; } // Store result out[i] = res; } } inline float atan_fma_approximation(float x) { float a1 = 0.99997726f; float a3 = -0.33262347f; float a5 = 0.19354346f; float a7 = -0.11643287f; float a9 = 0.05265332f; float a11 = -0.01172120f; // Compute approximation using Horner's method float x_sq = x*x; return x * fmaf(x_sq, fmaf(x_sq, fmaf(x_sq, fmaf(x_sq, fmaf(x_sq, a11, a9), a7), a5), a3), a1); } // Fifth automatic version: use FMA for the polynomial NOINLINE static void atan2_auto_4(size_t num_points, const float* ys, const float* xs, float* out) { float pi = M_PI; float pi_2 = M_PI_2; for (size_t i = 0; i < num_points; i++) { // Ensure input is in [-1, +1] float y = ys[i]; float x = xs[i]; bool swap = fabs(x) < fabs(y); float atan_input = (swap ? x : y) / (swap ? y : x); // Approximate atan float res = atan_fma_approximation(atan_input); // If swapped, adjust atan output res = swap ? copysignf(pi_2, atan_input) - res : res; // Adjust the result depending on the input quadrant if (x < 0.0f) { res = copysignf(pi, y) + res; } // Store result out[i] = res; } } #ifdef USE_AVX inline __m256 atan_avx_approximation(__m256 x) { __m256 a1 = _mm256_set1_ps( 0.99997726f); __m256 a3 = _mm256_set1_ps(-0.33262347f); __m256 a5 = _mm256_set1_ps( 0.19354346f); __m256 a7 = _mm256_set1_ps(-0.11643287f); __m256 a9 = _mm256_set1_ps( 0.05265332f); __m256 a11 = _mm256_set1_ps(-0.01172120f); __m256 x_sq = _mm256_mul_ps(x, x); __m256 result; result = a11; result = _mm256_fmadd_ps(x_sq, result, a9); result = _mm256_fmadd_ps(x_sq, result, a7); result = _mm256_fmadd_ps(x_sq, result, a5); result = _mm256_fmadd_ps(x_sq, result, a3); result = _mm256_fmadd_ps(x_sq, result, a1); result = _mm256_mul_ps(x, result); return result; } // First manual version: straightfoward translation of atan2_auto_5 NOINLINE static void atan2_manual_1(size_t num_points, const float* ys, const float* xs, float* out) { // Check that the input plays well with AVX assert_avx_aligned(ys), assert_avx_aligned(xs), assert_avx_aligned(out); // Store pi and pi/2 as constants const __m256 pi = _mm256_set1_ps(M_PI); const __m256 pi_2 = _mm256_set1_ps(M_PI_2); // Create bit masks that we will need. // The first one is all 1s except from the sign bit: // // 01111111111111111111111111111111 // // We can use it to make a float absolute by AND'ing with it. const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));; // The second is only the sign bit: // // 10000000000000000000000000000000 // // we can use it to extract the sign of a number by AND'ing with it. const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); for (size_t i = 0; i < num_points; i += 8) { // Load 8 elements from `ys` and `xs` into two vectors. __m256 y = _mm256_load_ps(&ys[i]); __m256 x = _mm256_load_ps(&xs[i]); // Compare |y| > |x|. This will create an 8-vector of floats that we can // use as a mask: the elements where the respective comparison is true will be filled // with 1s, with 0s where the comparison is false. // // For example, if we were comparing the vectors // // 5 -5 5 -5 5 -5 5 -5 // > // -5 5 -5 5 -5 5 -5 5 // = // 1s 0s 1s 0s 1s 0s 1s 0s // // Where `1s = 0xFFFFFFFF` and `0s = 0x00000000`. __m256 swap_mask = _mm256_cmp_ps( _mm256_and_ps(y, abs_mask), // |y| _mm256_and_ps(x, abs_mask), // |x| _CMP_GT_OS ); // Create the atan input by "blending" `y` and `x`, according to the mask computed // above. The blend instruction will pick the first or second argument based on // the mask we passed in. In our case we need the number of larger magnitude to // be the denominator. __m256 atan_input = _mm256_div_ps( _mm256_blendv_ps(y, x, swap_mask), // pick the lowest between |y| and |x| for each number _mm256_blendv_ps(x, y, swap_mask) // and the highest. ); // Approximate atan __m256 result = atan_avx_approximation(atan_input); // If swapped, adjust atan output. We use blending again to leave // the output unchanged if we didn't swap anything. // // If we need to adjust it, we simply carry the sign over from the input // to `pi_2` by using the `sign_mask`. This avoids a more expensive comparison, // and also handles edge cases such as -0 better. result = _mm256_blendv_ps( result, _mm256_sub_ps( _mm256_or_ps(pi_2, _mm256_and_ps(atan_input, sign_mask)), result ), swap_mask ); // Adjust the result depending on the input quadrant. // // We create a mask for the sign of `x` using an arithmetic right shift: // the mask will be all 0s if the sign if positive, and all 1s // if the sign is negative. __m256 x_sign_mask = _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31)); // Then use the mask to perform the adjustment only when the sign // if positive, and use the sign bit of `y` to know whether to add // `pi` or `-pi`. result = _mm256_add_ps( _mm256_and_ps( _mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask ), result ); // Store result _mm256_store_ps(&out[i], result); } } // Second manual version: use the abs values we get at the beginning // for more ILP (see comment below) NOINLINE static void atan2_manual_2(size_t num_points, const float* ys, const float* xs, float* out) { assert_avx_aligned(ys), assert_avx_aligned(xs), assert_avx_aligned(out); const __m256 pi = _mm256_set1_ps(M_PI); const __m256 pi_2 = _mm256_set1_ps(M_PI_2); // no sign bit -- AND with this to get absolute value const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));; // only sign bit -- XOR with this to get negative const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); for (size_t i = 0; i < num_points; i += 8) { __m256 y = _mm256_load_ps(&ys[i]); __m256 x = _mm256_load_ps(&xs[i]); __m256 abs_y = _mm256_and_ps(abs_mask, y); __m256 abs_x = _mm256_and_ps(abs_mask, x); // atan_input = min(|y|, |x|) / max(|y|, |x|) // // I've experimented with using blend rather than min, given that we // compute `swap_mask` later anyway, but that delays the atan // computation by 2+ cycles, and is less parallel, since on skylake: // // latency throughput // vminps 4 0.5 // vmaxps 4 0.5 // vblendvps 2 0.66 // vcmpps 4 0.5 // // So while it decreases the numbers of instructions needed it makes // the overall function slower. // // This has the unfortunate effect of silently dropping NaNs in the // inputs, since in case of NaNs these instructions just return // the second argument. __m256 atan_input = _mm256_div_ps( _mm256_min_ps(abs_x, abs_y), _mm256_max_ps(abs_x, abs_y) ); // Approximate atan __m256 result = atan_avx_approximation(atan_input); // We first do the usual +- pi - res, but in this case // we know the sign of pi since the input is always positive. // // result = (abs_y > abs_x) ? pi - res : res; __m256 swap_mask = _mm256_cmp_ps(abs_y, abs_x, _CMP_GT_OQ); result = _mm256_add_ps( _mm256_xor_ps(result, _mm256_and_ps(sign_mask, swap_mask)), _mm256_and_ps(pi_2, swap_mask) ); // We now have to adjust the quadrant slightly differently: // // * If both values are positive, do nothing; // * If y is negative and x is positive, do nothing; // * If y is positive and x is negative, do pi + result; // * If y is negative and x is negative, do result - pi; // // These can easily be verified by analyzing what happens to the output // for each input quadrant. // // These cases can be compressed to the two branches below, and then // made branchless without using any blend instruction. // result = (x < 0) ? pi - res : res; __m256 x_sign_mask = _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31)); result = _mm256_add_ps( _mm256_xor_ps(result, _mm256_and_ps(x, sign_mask)), _mm256_and_ps(pi, x_sign_mask) ); // result = (y < 0) ? -res : res; result = _mm256_xor_ps(result, _mm256_and_ps(y, sign_mask)); // Store result _mm256_store_ps(&out[i], result); } } #endif // -------------------------------------------------------------------- // data generation NOINLINE static void generate_data( bool random_points, bool test_edge_cases, size_t desired_num_cases, size_t* cases, float** ys, float** xs, float** ref_out, float** out ) { cout << "Generating data..." << flush; size_t edge = static_cast(sqrtf(static_cast(desired_num_cases))); vector extra_cases; if (test_edge_cases) { // We want to make sure to test some special cases, so we always add them. // We do not include negative zero, since we do not care about the sign // matching up in that case. // We also do not include NaN and inf, since we assume that the input does // not contain it. using limits = numeric_limits; extra_cases = { 1.0f, -1.0f, 0.0f, -0.0f, limits::epsilon(), -limits::epsilon(), limits::max(), -limits::max(), limits::quiet_NaN() }; } // The -1 is for the 0, 0 case, which we do not want. *cases = (edge+extra_cases.size())*(edge+extra_cases.size()) - 1; // Make sure cases is a multiple of 8 so that the AVX functions can work cleanly *cases = *cases + (8 - (*cases % 8)); *ys = reinterpret_cast(aligned_alloc(32, *cases * sizeof(float))); *xs = reinterpret_cast(aligned_alloc(32, *cases * sizeof(float))); *ref_out = reinterpret_cast(aligned_alloc(32, *cases * sizeof(float))); *out = reinterpret_cast(aligned_alloc(32, *cases * sizeof(float))); if (*ys == nullptr || *xs == nullptr || *ref_out == nullptr || *out == nullptr) { cerr << "Could not allocate arrays" << endl; exit(EXIT_FAILURE); } mt19937_64 gen(0); { float bound = 1.0f; float step = (bound * 2.0f) / static_cast(edge); uniform_real_distribution dist{ -1.0f, 1.0f }; const auto get_number = [random_points, &extra_cases, &gen, bound, step, &dist](size_t i) -> float { if (i < extra_cases.size()) { return extra_cases.at(i); } else if (random_points) { return dist(gen); } else { return -bound + static_cast(i - extra_cases.size()) * step; } }; size_t ix = 0; for (size_t i = 0; i < edge + extra_cases.size(); i++) { for (size_t j = 0; j < edge + extra_cases.size(); j++) { float y = get_number(i); float x = get_number(j); if (y == 0.0f && x == 0.0f) { continue; } (*ys)[ix] = y; (*xs)[ix] = x; ix++; } } // pad with dummies for (; ix < *cases; ix++) { (*ys)[ix] = 1.0f; (*xs)[ix] = 1.0f; } } // shuffle to confuse branch predictor in the case of explicit branches and // non-random points { for (size_t i = 0; i < *cases - 1; i++) { size_t swap_with = static_cast(i + 1 + (gen() % (*cases - i - 1))); swap((*ys)[i], (*ys)[swap_with]); swap((*xs)[i], (*xs)[swap_with]); } } cout << " done." << endl; } // -------------------------------------------------------------------- // Pin to first CPU static void pin_to_cpu_0(void) { cpu_set_t cpu_mask; CPU_ZERO(&cpu_mask); CPU_SET(0, &cpu_mask); if (sched_setaffinity(0, sizeof(cpu_mask), &cpu_mask) != 0) { fprintf(stderr, "Could not set CPU affinity\n"); exit(EXIT_FAILURE); } } // -------------------------------------------------------------------- // perf instrumentation -- a mixture of man 3 perf_event_open and // static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) { int ret; ret = syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); return ret; } static void setup_perf_event( struct perf_event_attr *evt, int *fd, uint64_t *id, uint32_t evt_type, uint64_t evt_config, int group_fd ) { memset(evt, 0, sizeof(struct perf_event_attr)); evt->type = evt_type; evt->size = sizeof(struct perf_event_attr); evt->config = evt_config; evt->disabled = 1; evt->exclude_kernel = 1; evt->exclude_hv = 1; evt->read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID; *fd = perf_event_open(evt, 0, -1, group_fd, 0); if (*fd == -1) { fprintf(stderr, "Error opening leader %llx\n", evt->config); exit(EXIT_FAILURE); } ioctl(*fd, PERF_EVENT_IOC_ID, id); } static struct perf_event_attr perf_cycles_evt; static int perf_cycles_fd; static uint64_t perf_cycles_id; static struct perf_event_attr perf_clock_evt; static int perf_clock_fd; static uint64_t perf_clock_id; static struct perf_event_attr perf_instrs_evt; static int perf_instrs_fd; static uint64_t perf_instrs_id; static struct perf_event_attr perf_cache_misses_evt; static int perf_cache_misses_fd; static uint64_t perf_cache_misses_id; static struct perf_event_attr perf_cache_references_evt; static int perf_cache_references_fd; static uint64_t perf_cache_references_id; static struct perf_event_attr perf_branch_misses_evt; static int perf_branch_misses_fd; static uint64_t perf_branch_misses_id; static struct perf_event_attr perf_branch_instructions_evt; static int perf_branch_instructions_fd; static uint64_t perf_branch_instructions_id; static void perf_init(void) { // Cycles setup_perf_event( &perf_cycles_evt, &perf_cycles_fd, &perf_cycles_id, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES, -1 ); // Clock setup_perf_event( &perf_clock_evt, &perf_clock_fd, &perf_clock_id, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_TASK_CLOCK, perf_cycles_fd ); // Instructions setup_perf_event( &perf_instrs_evt, &perf_instrs_fd, &perf_instrs_id, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, perf_cycles_fd ); // Cache misses setup_perf_event( &perf_cache_misses_evt, &perf_cache_misses_fd, &perf_cache_misses_id, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES, perf_cycles_fd ); // Cache references setup_perf_event( &perf_cache_references_evt, &perf_cache_references_fd, &perf_cache_references_id, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES, perf_cycles_fd ); // Branch misses setup_perf_event( &perf_branch_misses_evt, &perf_branch_misses_fd, &perf_branch_misses_id, PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES, perf_cycles_fd ); // Branch instructions setup_perf_event( &perf_branch_instructions_evt, &perf_branch_instructions_fd, &perf_branch_instructions_id, PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_INSTRUCTIONS, perf_cycles_fd ); } static void perf_close(void) { close(perf_clock_fd); close(perf_cycles_fd); close(perf_instrs_fd); close(perf_cache_misses_fd); close(perf_cache_references_fd); } static void disable_perf_count(void) { ioctl(perf_cycles_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP); } static void enable_perf_count(void) { ioctl(perf_cycles_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP); } static void reset_perf_count(void) { ioctl(perf_cycles_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP); } struct perf_read_value { uint64_t value; uint64_t id; }; struct perf_read_format { uint64_t nr; struct perf_read_value values[]; }; static char perf_read_buf[4096]; struct perf_count { uint64_t cycles; double seconds; uint64_t instructions; uint64_t cache_misses; uint64_t cache_references; uint64_t branch_misses; uint64_t branch_instructions; perf_count(): cycles(0), seconds(0.0), instructions(0), cache_misses(0), cache_references(0), branch_misses(0), branch_instructions(0) {} }; static void read_perf_count(struct perf_count *count) { if (!read(perf_cycles_fd, perf_read_buf, sizeof(perf_read_buf))) { fprintf(stderr, "Could not read cycles from perf\n"); exit(EXIT_FAILURE); } struct perf_read_format* rf = (struct perf_read_format *) perf_read_buf; if (rf->nr != 7) { fprintf(stderr, "Bad number of perf events\n"); exit(EXIT_FAILURE); } for (int i = 0; i < static_cast(rf->nr); i++) { struct perf_read_value *value = &rf->values[i]; if (value->id == perf_cycles_id) { count->cycles = value->value; } else if (value->id == perf_clock_id) { count->seconds = ((double) (value->value / 1000ull)) / 1000000.0; } else if (value->id == perf_instrs_id) { count->instructions = value->value; } else if (value->id == perf_cache_misses_id) { count->cache_misses = value->value; } else if (value->id == perf_cache_references_id) { count->cache_references = value->value; } else if (value->id == perf_branch_misses_id) { count->branch_misses = value->value; } else if (value->id == perf_branch_instructions_id) { count->branch_instructions = value->value; } else { fprintf(stderr, "Spurious value in perf read (%ld)\n", value->id); exit(EXIT_FAILURE); } } } // -------------------------------------------------------------------- // running the function struct max_error { float y; float x; float error; max_error(): y(0.0f), x(0.0f), error(0.0f) {} void update(float y, float x, float reference, float result) { if (isnan(reference) && !isnan(result)) { cerr << "Expected NaN in result, but got " << result << endl; cerr << "For point " << x << ", " << y << endl; exit(EXIT_FAILURE); } if (!isnan(reference) && isnan(result)) { cerr << "Unexpected NaN in result, expected " << reference << endl; cerr << "For point " << x << ", " << y << endl; exit(EXIT_FAILURE); } float error = abs(reference - result); if (error > this->error) { this->y = y; this->x = x; this->error = error; } } }; NOINLINE static void run_timed( int pre_iterations, int iterations, bool test_negative_zero, bool test_max, bool test_nan, size_t num, const string& name, void (*fun)(size_t, const float*, const float*, float*), const float* ref_out, // if null no comparison will be run const float* in_y, const float* in_x, float* out ) { if (iterations < 1) { fprintf(stderr, "iterations < 1: %d\n", iterations); exit(EXIT_FAILURE); } for (int i = 0; i < pre_iterations; i++) { fun(num, in_y, in_x, out); } struct perf_count counts; disable_perf_count(); reset_perf_count(); enable_perf_count(); // The enabling / disabling adds noise for small timings, so // wrap the loop since the loop counts for very little (one add, one cmp, one // predicted conditional jump). for (int i = 0; i < iterations; i++) { fun(num, in_y, in_x, out); } disable_perf_count(); read_perf_count(&counts); // two floats per element uint64_t bytes = ((uint64_t) num) * sizeof(float) * 2; double gb_per_s = (((double) bytes) / 1000000000.0) / (counts.seconds / ((double) iterations)); double time = counts.seconds; const char *unit = " s"; if (time < 0.1) { time *= 1000.0; unit = "ms"; } if (time < 0.1) { time *= 1000.0; unit = "us"; } constexpr int padded_name_size = 10; char padded_name[padded_name_size]; int name_len = snprintf(padded_name, padded_name_size, "%s:", name.c_str()); for (int i = name_len; i < padded_name_size; i++) { padded_name[i] = ' '; } padded_name[padded_name_size-1] = '\0'; printf( "%s %5.2f%s, %6.2fGB/s, %6.2f cycles/elem, %6.2f instrs/elem, %5.2f instrs/cycle, %5.2f branches/elem, %5.2f%% branch misses, %5.2f%% cache misses, %5.2fGHz", padded_name, time, unit, gb_per_s, ((double) counts.cycles) / ((double) iterations) / ((double) num), ((double) counts.instructions) / ((double) iterations) / ((double) num), ((double) counts.instructions) / ((double) counts.cycles), ((double) counts.branch_instructions) / ((double) iterations) / ((double) num), 100.0 * ((double) counts.branch_misses) / ((double) counts.branch_instructions), 100.0 * ((double) counts.cache_misses) / ((double) counts.cache_references), ((double) counts.cycles) / counts.seconds / 1000000000.0 ); if (ref_out != nullptr) { max_error max_error; for (size_t ix = 0; ix < num; ix++) { float y = in_y[ix]; float x = in_x[ix]; if (!test_negative_zero && (y == -0.0f || x == -0.0f)) { continue; } if ( !test_max && (fabs(y) == numeric_limits::max() || fabs(x) == numeric_limits::max()) ) { continue; } if (!test_nan && (isnan(y) || isnan(x))) { continue; } max_error.update(in_y[ix], in_x[ix], ref_out[ix], out[ix]); } printf( ", %.9fdeg max error, max error point: %+.9f, %+.9f\n", max_error.error * 180.0f / M_PI, max_error.x, max_error.y ); } else { printf("\n"); } } #define run_timed_atan2(pre_iterations, iterations, test_negative_zero, test_max, test_nan, num, fun, ref_out, in_y, in_x, out) \ run_timed(pre_iterations, iterations, test_negative_zero, test_max, test_nan, num, #fun, atan2_ ## fun, ref_out, in_y, in_x, out) // -------------------------------------------------------------------- // main NOINLINE static pair format_size(size_t size) { string unit = "B"; if (size > 10000ull) { size /= 1000ull; unit = "kB"; } if (size > 10000ull) { size /= 1000ull; unit = "MB"; } if (size > 10000ull) { size /= 1000ull; unit = "GB"; } return { unit, size }; } struct config { bool random_points = false; bool test_edge_cases = false; bool test_baseline = true; bool test_auto_1 = true; bool test_auto_2 = true; bool test_auto_3 = true; bool test_auto_4 = true; bool test_manual_1 = true; bool test_manual_2 = true; }; int main(int argc, const char* argv[]) { cout.precision(numeric_limits::max_digits10); pin_to_cpu_0(); perf_init(); config config; const auto bad_usage = [&argv]() { cerr << "Usage: " << argv[0] << " [--random-points] [--test-edge-cases] [--functions COMMA_SEPARATED_FUNCTIONS] NUMBER_OF_TEST_CASES NUMBER_OF_WARM_UPS NUMBER_OF_ITERATIONS" << endl; exit(EXIT_FAILURE); }; vector arguments; for (int i = 1; i < argc; i++) { string arg{argv[i]}; if (arg == "--random-points") { config.random_points = true; } else if (arg == "--test-edge-cases") { config.test_edge_cases = true; } else if (arg == "--functions") { config.test_baseline = false; config.test_auto_1 = false; config.test_auto_2 = false; config.test_auto_3 = false; config.test_auto_4 = false; config.test_manual_1 = false; config.test_manual_2 = false; if (i < argc - 1) { i++; stringstream functions{ argv[i] }; for (string function; getline(functions, function, ','); ) { if (function == "baseline") { config.test_baseline = true; } else if (function == "auto_1") { config.test_auto_1 = true; } else if (function == "auto_2") { config.test_auto_2 = true; } else if (function == "auto_3") { config.test_auto_3 = true; } else if (function == "auto_4") { config.test_auto_4 = true; } #ifdef USE_AVX else if (function == "manual") { config.test_manual_1 = true; } else if (function == "manual_2") { config.test_manual_2 = true; } #endif else { cerr << "Bad function " << function << endl; bad_usage(); } } } else { cerr << "Expecting argument after --functions" << endl; bad_usage(); } } else { arguments.emplace_back(arg); } } if (arguments.size() != 3) { bad_usage(); } size_t desired_cases; if (sscanf(arguments.at(0).c_str(), "%zu", &desired_cases) != 1) { cerr << "Could not parse number of cases" << endl; bad_usage(); } int pre_iterations; if (sscanf(arguments.at(1).c_str(), "%d", &pre_iterations) != 1) { cerr << "Could not parse warm ups" << endl; bad_usage(); } int iterations; if (sscanf(arguments.at(2).c_str(), "%d", &iterations) != 1) { cerr << "Could not parse iterations" << endl; bad_usage(); } size_t cases; float* ys; float* xs; float* ref_out; float* out; generate_data( config.random_points, config.test_edge_cases, desired_cases, &cases, &ys, &xs, &ref_out, &out ); if (!config.test_baseline) { free(ref_out); ref_out = nullptr; } const auto formatted_input_size = format_size(sizeof(float) * 2 * cases); const auto formatted_output_size = format_size(sizeof(float) * cases); cout << "Tests will read " << formatted_input_size.second << formatted_input_size.first << " and write " << formatted_output_size.second << formatted_output_size.first << " (" << cases << " points)" << endl; cout << "Running " << pre_iterations << " warm ups and " << iterations << " iterations" << endl; cout << endl; if (config.test_baseline) { run_timed(pre_iterations, iterations, true, true, true, cases, "baseline", atan2_baseline, nullptr, ys, xs, ref_out); } // auto_1 to auto_3 do not support the max value because the input gets // reduce to negative zero in that case and the atan_input >= 0 branch // doesn't preserve the sign properly. // // Similarly, none of the functions apart from manual_1 and manual_2 support // negative zero properly because of the x < 0 branch. // // manual_2 silently drop NaNs, it can be made to not do that at slight // performance hit (see comment in it). if (config.test_auto_1) { run_timed_atan2(pre_iterations, iterations, false, false, true, cases, auto_1, ref_out, ys, xs, out); } if (config.test_auto_2) { run_timed_atan2(pre_iterations, iterations, false, false, true, cases, auto_2, ref_out, ys, xs, out); } if (config.test_auto_3) { run_timed_atan2(pre_iterations, iterations, false, false, true, cases, auto_3, ref_out, ys, xs, out); } if (config.test_auto_4) { run_timed_atan2(pre_iterations, iterations, false, true, true, cases, auto_4, ref_out, ys, xs, out); } #ifdef USE_AVX if (config.test_manual_1) { run_timed_atan2(pre_iterations, iterations, true, true, true, cases, manual_1, ref_out, ys, xs, out); } if (config.test_manual_2) { run_timed_atan2(pre_iterations, iterations, true, true, true, cases, manual_2, ref_out, ys, xs, out); } #endif cout << endl; free(ys); free(xs); free(ref_out); free(out); perf_close(); return EXIT_SUCCESS; }