-
-
Save bitonic/d0f5a0a44e37d4f0be03d34d47acb6cf to your computer and use it in GitHub Desktop.
| // Copyright (c) 2021 Francesco Mazzoli <[email protected]> | |
| // | |
| // Permission to use, copy, modify, and distribute this software for any | |
| // purpose with or without fee is hereby granted, provided that the above | |
| // copyright notice and this permission notice appear in all copies. | |
| // | |
| // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | |
| // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | |
| // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | |
| // ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | |
| // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | |
| // ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | |
| // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | |
| // | |
| // Branchless, vectorized `atan2f`. Various functions of increasing | |
| // performance are presented. The fastest version is 50~ faster than libc | |
| // on batch workloads, outputing a result every ~2 clock cycles, compared to | |
| // ~110 for libc. The functions all use the same `atan` approximation, and their | |
| // max error is around ~1/10000 of a degree. | |
| // | |
| // They also do not handle inf / -inf | |
| // and the origin as an input as they should -- in our case these are a sign | |
| // that something is wrong anyway. Moreover, manual_2 does not handle NaN | |
| // correctly (it drops them silently), and all the auto_ functions do not | |
| // handle -0 correctly. But manual_1 handles everything but +-inf and +-0,+-0 | |
| // correctly. | |
| // | |
| // Tested on a Xeon W-2145, a Skylake processor. Compiled with | |
| // | |
| // $ clang++ --version | |
| // clang version 12.0.1 | |
| // $ clang++ -static --std=c++20 -march=skylake -O3 -Wall vectorized-atan2f.cpp -lm -o vectorized-atan2f | |
| // | |
| // Results: | |
| // | |
| // $ ./vectorized-atan2f --test-edge-cases 100000 100 1000 | |
| // Generating data... done. | |
| // Tests will read 824kB and write 412kB (103048 points) | |
| // Running 1000 warm ups and 1000 iterations | |
| // | |
| // baseline: 2.46 s, 0.32GB/s, 105.79 cycles/elem, 129.34 instrs/elem, 1.22 instrs/cycle, 27.76 branches/elem, 7.08% branch misses, 0.47% cache misses, 4.29GHz | |
| // auto_1: 0.21 s, 3.75GB/s, 8.29 cycles/elem, 8.38 instrs/elem, 1.01 instrs/cycle, 0.13 branches/elem, 0.01% branch misses, 0.03% cache misses, 3.89GHz, 0.000109283deg max error, max error point: -0.563291132, -0.544303775 | |
| // auto_2: 97.50ms, 8.19GB/s, 3.79 cycles/elem, 5.38 instrs/elem, 1.42 instrs/cycle, 0.13 branches/elem, 0.01% branch misses, 0.01% cache misses, 3.88GHz, 0.000109283deg max error, max error point: -0.854430377, +0.107594967 | |
| // auto_3: 75.95ms, 10.52GB/s, 2.95 cycles/elem, 4.13 instrs/elem, 1.40 instrs/cycle, 0.13 branches/elem, 0.01% branch misses, 0.03% cache misses, 3.88GHz, 0.000109283deg max error, max error point: -0.854430377, +0.107594967 | |
| // auto_4: 55.89ms, 14.29GB/s, 2.17 cycles/elem, 3.63 instrs/elem, 1.67 instrs/cycle, 0.13 branches/elem, 0.01% branch misses, 0.01% cache misses, 3.88GHz, 0.000109283deg max error, max error point: -0.854430377, +0.107594967 | |
| // manual_1: 52.57ms, 15.20GB/s, 2.04 cycles/elem, 3.63 instrs/elem, 1.78 instrs/cycle, 0.13 branches/elem, 0.01% branch misses, 0.03% cache misses, 3.88GHz, 0.000109283deg max error, max error point: -0.854430377, +0.107594967 | |
| // manual_2: 50.16ms, 15.93GB/s, 1.95 cycles/elem, 3.88 instrs/elem, 1.99 instrs/cycle, 0.13 branches/elem, 0.01% branch misses, 0.02% cache misses, 3.88GHz, 0.000109283deg max error, max error point: -0.854430377, +0.107594967 | |
| // | |
| // The atan approximation is from sheet 11 of "Approximations for digital | |
| // computers", C. Hastings, 1955, a delightful document overall. | |
| // | |
| // Functions auto_1 to auto_5 are automatically vectorized. manual_1 and manual_2 | |
| // are vectorized manually. | |
| // | |
| // You'll get quite different results with g++. The main problem with g++ is that | |
| // it vectorizes less aggressively. However it inserts FMA instructions _more_ | |
| // aggresively, which is a plus. clang needs the `fp-contract` pragma | |
| // or an explicit `fmaf`. | |
| // | |
| // Moreover, the | |
| #include <cmath> | |
| #include <algorithm> | |
| #include <iostream> | |
| #include <limits> | |
| #include <immintrin.h> | |
| #include <vector> | |
| #include <unistd.h> | |
| #include <sys/ioctl.h> | |
| #include <linux/perf_event.h> | |
| #include <string.h> | |
| #include <asm/unistd.h> | |
| #include <random> | |
| #include <sstream> | |
| #define USE_AVX | |
| using namespace std; | |
| #define NOINLINE __attribute__((noinline)) | |
| #define UNUSED __attribute__((unused)) | |
| // -------------------------------------------------------------------- | |
| // AVX utils | |
| #ifdef USE_AVX | |
| template<typename A> | |
| inline void assert_avx_aligned(const A* ptr) { | |
| if (reinterpret_cast<uintptr_t>(ptr) % 32 != 0) { | |
| cerr << "Pointer " << ptr << " is not 32-byte aligned, exiting" << endl; | |
| exit(EXIT_FAILURE); | |
| } | |
| } | |
| inline void assert_multiple_of_8(size_t num) { | |
| if (num % 8 != 0) { | |
| cerr << "Array size " << num << " is not a multiple of 8, exiting" << endl; | |
| exit(EXIT_FAILURE); | |
| } | |
| } | |
| #endif | |
| // -------------------------------------------------------------------- | |
| // functions | |
| NOINLINE | |
| static void atan2_baseline(size_t cases, const float* ys, const float* xs, float* out) { | |
| for (size_t i = 0; i < cases; i++) { | |
| out[i] = atan2f(ys[i], xs[i]); | |
| } | |
| } | |
| // not tested since it is very slow. | |
| NOINLINE UNUSED | |
| static void atan2_fpatan(size_t cases, const float* ys, const float* xs, float* out) { | |
| for (size_t i = 0; i < cases; i++) { | |
| asm ( | |
| "flds (%[ys], %[i], 4)\n" | |
| "flds (%[xs], %[i], 4)\n" | |
| "fpatan\n" | |
| "fstps (%[out], %[i], 4)\n" | |
| : | |
| : [ys]"r"(ys), [xs]"r"(xs), [out]"r"(out), [i]"r"(i) | |
| ); | |
| } | |
| } | |
| // Polynomial approximation of atan between [-1, 1]. Stated max error ~0.000001rad. | |
| // See comment at the beginning of file for source. | |
| inline float atan_approximation(float x) { | |
| float a1 = 0.99997726f; | |
| float a3 = -0.33262347f; | |
| float a5 = 0.19354346f; | |
| float a7 = -0.11643287f; | |
| float a9 = 0.05265332f; | |
| float a11 = -0.01172120f; | |
| float x_sq = x*x; | |
| return | |
| x * (a1 + x_sq * (a3 + x_sq * (a5 + x_sq * (a7 + x_sq * (a9 + x_sq * a11))))); | |
| } | |
| // First automatic version: naive translation of the maths | |
| NOINLINE | |
| static void atan2_auto_1(size_t num_points, const float* ys, const float* xs, float* out) { | |
| for (size_t i = 0; i < num_points; i++) { | |
| // Ensure input is in [-1, +1] | |
| float y = ys[i]; | |
| float x = xs[i]; | |
| bool swap = fabs(x) < fabs(y); | |
| float atan_input = (swap ? x : y) / (swap ? y : x); | |
| // Approximate atan | |
| float res = atan_approximation(atan_input); | |
| // If swapped, adjust atan output | |
| res = swap ? (atan_input >= 0.0f ? M_PI_2 : -M_PI_2) - res : res; | |
| // Adjust quadrants | |
| if (x >= 0.0f && y >= 0.0f) {} // 1st quadrant | |
| else if (x < 0.0f && y >= 0.0f) { res = M_PI + res; } // 2nd quadrant | |
| else if (x < 0.0f && y < 0.0f) { res = -M_PI + res; } // 3rd quadrant | |
| else if (x >= 0.0f && y < 0.0f) {} // 4th quadrant | |
| // Store result | |
| out[i] = res; | |
| } | |
| } | |
| // Second automatic version: get rid of casting to double | |
| NOINLINE | |
| static void atan2_auto_2(size_t num_points, const float* ys, const float* xs, float* out) { | |
| float pi = M_PI; | |
| float pi_2 = M_PI_2; | |
| for (size_t i = 0; i < num_points; i++) { | |
| // Ensure input is in [-1, +1] | |
| float y = ys[i]; | |
| float x = xs[i]; | |
| bool swap = fabs(x) < fabs(y); | |
| float atan_input = (swap ? x : y) / (swap ? y : x); | |
| // Approximate atan | |
| float res = atan_approximation(atan_input); | |
| // If swapped, adjust atan output | |
| res = swap ? (atan_input >= 0.0f ? pi_2 : -pi_2) - res : res; | |
| // Adjust quadrants | |
| if (x >= 0.0f && y >= 0.0f) {} // 1st quadrant | |
| else if (x < 0.0f && y >= 0.0f) { res = pi + res; } // 2nd quadrant | |
| else if (x < 0.0f && y < 0.0f) { res = -pi + res; } // 3rd quadrant | |
| else if (x >= 0.0f && y < 0.0f) {} // 4th quadrant | |
| // Store result | |
| out[i] = res; | |
| } | |
| } | |
| // Third automatic version: perform positive check for x and y once -- the compiler | |
| // can't assume that in the presence of NaNs since `0/0 >= 0` is false and `0/0 < 0` is also | |
| // false. | |
| NOINLINE | |
| static void atan2_auto_3(size_t num_points, const float* ys, const float* xs, float* out) { | |
| float pi = M_PI; | |
| float pi_2 = M_PI_2; | |
| for (size_t i = 0; i < num_points; i++) { | |
| // Ensure input is in [-1, +1] | |
| float y = ys[i]; | |
| float x = xs[i]; | |
| bool swap = fabs(x) < fabs(y); | |
| float atan_input = (swap ? x : y) / (swap ? y : x); | |
| // Approximate atan | |
| float res = atan_approximation(atan_input); | |
| // If swapped, adjust atan output | |
| res = swap ? (atan_input >= 0.0f ? pi_2 : -pi_2) - res : res; | |
| // Adjust the result depending on the input quadrant | |
| if (x < 0.0f) { | |
| res = (y >= 0.0f ? pi : -pi) + res; | |
| } | |
| // Store result | |
| out[i] = res; | |
| } | |
| } | |
| inline float atan_fma_approximation(float x) { | |
| float a1 = 0.99997726f; | |
| float a3 = -0.33262347f; | |
| float a5 = 0.19354346f; | |
| float a7 = -0.11643287f; | |
| float a9 = 0.05265332f; | |
| float a11 = -0.01172120f; | |
| // Compute approximation using Horner's method | |
| float x_sq = x*x; | |
| return | |
| x * fmaf(x_sq, fmaf(x_sq, fmaf(x_sq, fmaf(x_sq, fmaf(x_sq, a11, a9), a7), a5), a3), a1); | |
| } | |
| // Fifth automatic version: use FMA for the polynomial | |
| NOINLINE | |
| static void atan2_auto_4(size_t num_points, const float* ys, const float* xs, float* out) { | |
| float pi = M_PI; | |
| float pi_2 = M_PI_2; | |
| for (size_t i = 0; i < num_points; i++) { | |
| // Ensure input is in [-1, +1] | |
| float y = ys[i]; | |
| float x = xs[i]; | |
| bool swap = fabs(x) < fabs(y); | |
| float atan_input = (swap ? x : y) / (swap ? y : x); | |
| // Approximate atan | |
| float res = atan_fma_approximation(atan_input); | |
| // If swapped, adjust atan output | |
| res = swap ? copysignf(pi_2, atan_input) - res : res; | |
| // Adjust the result depending on the input quadrant | |
| if (x < 0.0f) { | |
| res = copysignf(pi, y) + res; | |
| } | |
| // Store result | |
| out[i] = res; | |
| } | |
| } | |
| #ifdef USE_AVX | |
| inline __m256 atan_avx_approximation(__m256 x) { | |
| __m256 a1 = _mm256_set1_ps( 0.99997726f); | |
| __m256 a3 = _mm256_set1_ps(-0.33262347f); | |
| __m256 a5 = _mm256_set1_ps( 0.19354346f); | |
| __m256 a7 = _mm256_set1_ps(-0.11643287f); | |
| __m256 a9 = _mm256_set1_ps( 0.05265332f); | |
| __m256 a11 = _mm256_set1_ps(-0.01172120f); | |
| __m256 x_sq = _mm256_mul_ps(x, x); | |
| __m256 result; | |
| result = a11; | |
| result = _mm256_fmadd_ps(x_sq, result, a9); | |
| result = _mm256_fmadd_ps(x_sq, result, a7); | |
| result = _mm256_fmadd_ps(x_sq, result, a5); | |
| result = _mm256_fmadd_ps(x_sq, result, a3); | |
| result = _mm256_fmadd_ps(x_sq, result, a1); | |
| result = _mm256_mul_ps(x, result); | |
| return result; | |
| } | |
| // First manual version: straightfoward translation of atan2_auto_5 | |
| NOINLINE | |
| static void atan2_manual_1(size_t num_points, const float* ys, const float* xs, float* out) { | |
| // Check that the input plays well with AVX | |
| assert_avx_aligned(ys), assert_avx_aligned(xs), assert_avx_aligned(out); | |
| // Store pi and pi/2 as constants | |
| const __m256 pi = _mm256_set1_ps(M_PI); | |
| const __m256 pi_2 = _mm256_set1_ps(M_PI_2); | |
| // Create bit masks that we will need. | |
| // The first one is all 1s except from the sign bit: | |
| // | |
| // 01111111111111111111111111111111 | |
| // | |
| // We can use it to make a float absolute by AND'ing with it. | |
| const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));; | |
| // The second is only the sign bit: | |
| // | |
| // 10000000000000000000000000000000 | |
| // | |
| // we can use it to extract the sign of a number by AND'ing with it. | |
| const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); | |
| for (size_t i = 0; i < num_points; i += 8) { | |
| // Load 8 elements from `ys` and `xs` into two vectors. | |
| __m256 y = _mm256_load_ps(&ys[i]); | |
| __m256 x = _mm256_load_ps(&xs[i]); | |
| // Compare |y| > |x|. This will create an 8-vector of floats that we can | |
| // use as a mask: the elements where the respective comparison is true will be filled | |
| // with 1s, with 0s where the comparison is false. | |
| // | |
| // For example, if we were comparing the vectors | |
| // | |
| // 5 -5 5 -5 5 -5 5 -5 | |
| // > | |
| // -5 5 -5 5 -5 5 -5 5 | |
| // = | |
| // 1s 0s 1s 0s 1s 0s 1s 0s | |
| // | |
| // Where `1s = 0xFFFFFFFF` and `0s = 0x00000000`. | |
| __m256 swap_mask = _mm256_cmp_ps( | |
| _mm256_and_ps(y, abs_mask), // |y| | |
| _mm256_and_ps(x, abs_mask), // |x| | |
| _CMP_GT_OS | |
| ); | |
| // Create the atan input by "blending" `y` and `x`, according to the mask computed | |
| // above. The blend instruction will pick the first or second argument based on | |
| // the mask we passed in. In our case we need the number of larger magnitude to | |
| // be the denominator. | |
| __m256 atan_input = _mm256_div_ps( | |
| _mm256_blendv_ps(y, x, swap_mask), // pick the lowest between |y| and |x| for each number | |
| _mm256_blendv_ps(x, y, swap_mask) // and the highest. | |
| ); | |
| // Approximate atan | |
| __m256 result = atan_avx_approximation(atan_input); | |
| // If swapped, adjust atan output. We use blending again to leave | |
| // the output unchanged if we didn't swap anything. | |
| // | |
| // If we need to adjust it, we simply carry the sign over from the input | |
| // to `pi_2` by using the `sign_mask`. This avoids a more expensive comparison, | |
| // and also handles edge cases such as -0 better. | |
| result = _mm256_blendv_ps( | |
| result, | |
| _mm256_sub_ps( | |
| _mm256_or_ps(pi_2, _mm256_and_ps(atan_input, sign_mask)), | |
| result | |
| ), | |
| swap_mask | |
| ); | |
| // Adjust the result depending on the input quadrant. | |
| // | |
| // We create a mask for the sign of `x` using an arithmetic right shift: | |
| // the mask will be all 0s if the sign if positive, and all 1s | |
| // if the sign is negative. | |
| __m256 x_sign_mask = _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31)); | |
| // Then use the mask to perform the adjustment only when the sign | |
| // if positive, and use the sign bit of `y` to know whether to add | |
| // `pi` or `-pi`. | |
| result = _mm256_add_ps( | |
| _mm256_and_ps( | |
| _mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), | |
| x_sign_mask | |
| ), | |
| result | |
| ); | |
| // Store result | |
| _mm256_store_ps(&out[i], result); | |
| } | |
| } | |
| // Second manual version: use the abs values we get at the beginning | |
| // for more ILP (see comment below) | |
| NOINLINE | |
| static void atan2_manual_2(size_t num_points, const float* ys, const float* xs, float* out) { | |
| assert_avx_aligned(ys), assert_avx_aligned(xs), assert_avx_aligned(out); | |
| const __m256 pi = _mm256_set1_ps(M_PI); | |
| const __m256 pi_2 = _mm256_set1_ps(M_PI_2); | |
| // no sign bit -- AND with this to get absolute value | |
| const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));; | |
| // only sign bit -- XOR with this to get negative | |
| const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); | |
| for (size_t i = 0; i < num_points; i += 8) { | |
| __m256 y = _mm256_load_ps(&ys[i]); | |
| __m256 x = _mm256_load_ps(&xs[i]); | |
| __m256 abs_y = _mm256_and_ps(abs_mask, y); | |
| __m256 abs_x = _mm256_and_ps(abs_mask, x); | |
| // atan_input = min(|y|, |x|) / max(|y|, |x|) | |
| // | |
| // I've experimented with using blend rather than min, given that we | |
| // compute `swap_mask` later anyway, but that delays the atan | |
| // computation by 2+ cycles, and is less parallel, since on skylake: | |
| // | |
| // latency throughput | |
| // vminps 4 0.5 | |
| // vmaxps 4 0.5 | |
| // vblendvps 2 0.66 | |
| // vcmpps 4 0.5 | |
| // | |
| // So while it decreases the numbers of instructions needed it makes | |
| // the overall function slower. | |
| // | |
| // This has the unfortunate effect of silently dropping NaNs in the | |
| // inputs, since in case of NaNs these instructions just return | |
| // the second argument. | |
| __m256 atan_input = _mm256_div_ps( | |
| _mm256_min_ps(abs_x, abs_y), _mm256_max_ps(abs_x, abs_y) | |
| ); | |
| // Approximate atan | |
| __m256 result = atan_avx_approximation(atan_input); | |
| // We first do the usual +- pi - res, but in this case | |
| // we know the sign of pi since the input is always positive. | |
| // | |
| // result = (abs_y > abs_x) ? pi - res : res; | |
| __m256 swap_mask = _mm256_cmp_ps(abs_y, abs_x, _CMP_GT_OQ); | |
| result = _mm256_add_ps( | |
| _mm256_xor_ps(result, _mm256_and_ps(sign_mask, swap_mask)), | |
| _mm256_and_ps(pi_2, swap_mask) | |
| ); | |
| // We now have to adjust the quadrant slightly differently: | |
| // | |
| // * If both values are positive, do nothing; | |
| // * If y is negative and x is positive, do nothing; | |
| // * If y is positive and x is negative, do pi + result; | |
| // * If y is negative and x is negative, do result - pi; | |
| // | |
| // These can easily be verified by analyzing what happens to the output | |
| // for each input quadrant. | |
| // | |
| // These cases can be compressed to the two branches below, and then | |
| // made branchless without using any blend instruction. | |
| // result = (x < 0) ? pi - res : res; | |
| __m256 x_sign_mask = _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31)); | |
| result = _mm256_add_ps( | |
| _mm256_xor_ps(result, _mm256_and_ps(x, sign_mask)), | |
| _mm256_and_ps(pi, x_sign_mask) | |
| ); | |
| // result = (y < 0) ? -res : res; | |
| result = _mm256_xor_ps(result, _mm256_and_ps(y, sign_mask)); | |
| // Store result | |
| _mm256_store_ps(&out[i], result); | |
| } | |
| } | |
| #endif | |
| // -------------------------------------------------------------------- | |
| // data generation | |
| NOINLINE | |
| static void generate_data( | |
| bool random_points, | |
| bool test_edge_cases, | |
| size_t desired_num_cases, | |
| size_t* cases, | |
| float** ys, | |
| float** xs, | |
| float** ref_out, | |
| float** out | |
| ) { | |
| cout << "Generating data..." << flush; | |
| size_t edge = static_cast<size_t>(sqrtf(static_cast<float>(desired_num_cases))); | |
| vector<float> extra_cases; | |
| if (test_edge_cases) { | |
| // We want to make sure to test some special cases, so we always add them. | |
| // We do not include negative zero, since we do not care about the sign | |
| // matching up in that case. | |
| // We also do not include NaN and inf, since we assume that the input does | |
| // not contain it. | |
| using limits = numeric_limits<float>; | |
| extra_cases = { | |
| 1.0f, -1.0f, 0.0f, -0.0f, | |
| limits::epsilon(), -limits::epsilon(), | |
| limits::max(), -limits::max(), | |
| limits::quiet_NaN() | |
| }; | |
| } | |
| // The -1 is for the 0, 0 case, which we do not want. | |
| *cases = (edge+extra_cases.size())*(edge+extra_cases.size()) - 1; | |
| // Make sure cases is a multiple of 8 so that the AVX functions can work cleanly | |
| *cases = *cases + (8 - (*cases % 8)); | |
| *ys = reinterpret_cast<float*>(aligned_alloc(32, *cases * sizeof(float))); | |
| *xs = reinterpret_cast<float*>(aligned_alloc(32, *cases * sizeof(float))); | |
| *ref_out = reinterpret_cast<float*>(aligned_alloc(32, *cases * sizeof(float))); | |
| *out = reinterpret_cast<float*>(aligned_alloc(32, *cases * sizeof(float))); | |
| if (*ys == nullptr || *xs == nullptr || *ref_out == nullptr || *out == nullptr) { | |
| cerr << "Could not allocate arrays" << endl; | |
| exit(EXIT_FAILURE); | |
| } | |
| mt19937_64 gen(0); | |
| { | |
| float bound = 1.0f; | |
| float step = (bound * 2.0f) / static_cast<float>(edge); | |
| uniform_real_distribution<float> dist{ -1.0f, 1.0f }; | |
| const auto get_number = [random_points, &extra_cases, &gen, bound, step, &dist](size_t i) -> float { | |
| if (i < extra_cases.size()) { | |
| return extra_cases.at(i); | |
| } else if (random_points) { | |
| return dist(gen); | |
| } else { | |
| return -bound + static_cast<float>(i - extra_cases.size()) * step; | |
| } | |
| }; | |
| size_t ix = 0; | |
| for (size_t i = 0; i < edge + extra_cases.size(); i++) { | |
| for (size_t j = 0; j < edge + extra_cases.size(); j++) { | |
| float y = get_number(i); | |
| float x = get_number(j); | |
| if (y == 0.0f && x == 0.0f) { continue; } | |
| (*ys)[ix] = y; | |
| (*xs)[ix] = x; | |
| ix++; | |
| } | |
| } | |
| // pad with dummies | |
| for (; ix < *cases; ix++) { | |
| (*ys)[ix] = 1.0f; | |
| (*xs)[ix] = 1.0f; | |
| } | |
| } | |
| // shuffle to confuse branch predictor in the case of explicit branches and | |
| // non-random points | |
| { | |
| for (size_t i = 0; i < *cases - 1; i++) { | |
| size_t swap_with = static_cast<size_t>(i + 1 + (gen() % (*cases - i - 1))); | |
| swap((*ys)[i], (*ys)[swap_with]); | |
| swap((*xs)[i], (*xs)[swap_with]); | |
| } | |
| } | |
| cout << " done." << endl; | |
| } | |
| // -------------------------------------------------------------------- | |
| // Pin to first CPU | |
| static void pin_to_cpu_0(void) { | |
| cpu_set_t cpu_mask; | |
| CPU_ZERO(&cpu_mask); | |
| CPU_SET(0, &cpu_mask); | |
| if (sched_setaffinity(0, sizeof(cpu_mask), &cpu_mask) != 0) { | |
| fprintf(stderr, "Could not set CPU affinity\n"); | |
| exit(EXIT_FAILURE); | |
| } | |
| } | |
| // -------------------------------------------------------------------- | |
| // perf instrumentation -- a mixture of man 3 perf_event_open and | |
| // <https://stackoverflow.com/a/42092180> | |
| static long | |
| perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) { | |
| int ret; | |
| ret = syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); | |
| return ret; | |
| } | |
| static void setup_perf_event( | |
| struct perf_event_attr *evt, | |
| int *fd, | |
| uint64_t *id, | |
| uint32_t evt_type, | |
| uint64_t evt_config, | |
| int group_fd | |
| ) { | |
| memset(evt, 0, sizeof(struct perf_event_attr)); | |
| evt->type = evt_type; | |
| evt->size = sizeof(struct perf_event_attr); | |
| evt->config = evt_config; | |
| evt->disabled = 1; | |
| evt->exclude_kernel = 1; | |
| evt->exclude_hv = 1; | |
| evt->read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID; | |
| *fd = perf_event_open(evt, 0, -1, group_fd, 0); | |
| if (*fd == -1) { | |
| fprintf(stderr, "Error opening leader %llx\n", evt->config); | |
| exit(EXIT_FAILURE); | |
| } | |
| ioctl(*fd, PERF_EVENT_IOC_ID, id); | |
| } | |
| static struct perf_event_attr perf_cycles_evt; | |
| static int perf_cycles_fd; | |
| static uint64_t perf_cycles_id; | |
| static struct perf_event_attr perf_clock_evt; | |
| static int perf_clock_fd; | |
| static uint64_t perf_clock_id; | |
| static struct perf_event_attr perf_instrs_evt; | |
| static int perf_instrs_fd; | |
| static uint64_t perf_instrs_id; | |
| static struct perf_event_attr perf_cache_misses_evt; | |
| static int perf_cache_misses_fd; | |
| static uint64_t perf_cache_misses_id; | |
| static struct perf_event_attr perf_cache_references_evt; | |
| static int perf_cache_references_fd; | |
| static uint64_t perf_cache_references_id; | |
| static struct perf_event_attr perf_branch_misses_evt; | |
| static int perf_branch_misses_fd; | |
| static uint64_t perf_branch_misses_id; | |
| static struct perf_event_attr perf_branch_instructions_evt; | |
| static int perf_branch_instructions_fd; | |
| static uint64_t perf_branch_instructions_id; | |
| static void perf_init(void) { | |
| // Cycles | |
| setup_perf_event( | |
| &perf_cycles_evt, &perf_cycles_fd, &perf_cycles_id, | |
| PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES, -1 | |
| ); | |
| // Clock | |
| setup_perf_event( | |
| &perf_clock_evt, &perf_clock_fd, &perf_clock_id, | |
| PERF_TYPE_SOFTWARE, PERF_COUNT_SW_TASK_CLOCK, perf_cycles_fd | |
| ); | |
| // Instructions | |
| setup_perf_event( | |
| &perf_instrs_evt, &perf_instrs_fd, &perf_instrs_id, | |
| PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, perf_cycles_fd | |
| ); | |
| // Cache misses | |
| setup_perf_event( | |
| &perf_cache_misses_evt, &perf_cache_misses_fd, &perf_cache_misses_id, | |
| PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES, perf_cycles_fd | |
| ); | |
| // Cache references | |
| setup_perf_event( | |
| &perf_cache_references_evt, &perf_cache_references_fd, &perf_cache_references_id, | |
| PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES, perf_cycles_fd | |
| ); | |
| // Branch misses | |
| setup_perf_event( | |
| &perf_branch_misses_evt, &perf_branch_misses_fd, &perf_branch_misses_id, | |
| PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES, perf_cycles_fd | |
| ); | |
| // Branch instructions | |
| setup_perf_event( | |
| &perf_branch_instructions_evt, &perf_branch_instructions_fd, &perf_branch_instructions_id, | |
| PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_INSTRUCTIONS, perf_cycles_fd | |
| ); | |
| } | |
| static void perf_close(void) { | |
| close(perf_clock_fd); | |
| close(perf_cycles_fd); | |
| close(perf_instrs_fd); | |
| close(perf_cache_misses_fd); | |
| close(perf_cache_references_fd); | |
| } | |
| static void disable_perf_count(void) { | |
| ioctl(perf_cycles_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP); | |
| } | |
| static void enable_perf_count(void) { | |
| ioctl(perf_cycles_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP); | |
| } | |
| static void reset_perf_count(void) { | |
| ioctl(perf_cycles_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP); | |
| } | |
| struct perf_read_value { | |
| uint64_t value; | |
| uint64_t id; | |
| }; | |
| struct perf_read_format { | |
| uint64_t nr; | |
| struct perf_read_value values[]; | |
| }; | |
| static char perf_read_buf[4096]; | |
| struct perf_count { | |
| uint64_t cycles; | |
| double seconds; | |
| uint64_t instructions; | |
| uint64_t cache_misses; | |
| uint64_t cache_references; | |
| uint64_t branch_misses; | |
| uint64_t branch_instructions; | |
| perf_count(): cycles(0), seconds(0.0), instructions(0), cache_misses(0), cache_references(0), branch_misses(0), branch_instructions(0) {} | |
| }; | |
| static void read_perf_count(struct perf_count *count) { | |
| if (!read(perf_cycles_fd, perf_read_buf, sizeof(perf_read_buf))) { | |
| fprintf(stderr, "Could not read cycles from perf\n"); | |
| exit(EXIT_FAILURE); | |
| } | |
| struct perf_read_format* rf = (struct perf_read_format *) perf_read_buf; | |
| if (rf->nr != 7) { | |
| fprintf(stderr, "Bad number of perf events\n"); | |
| exit(EXIT_FAILURE); | |
| } | |
| for (int i = 0; i < static_cast<int>(rf->nr); i++) { | |
| struct perf_read_value *value = &rf->values[i]; | |
| if (value->id == perf_cycles_id) { | |
| count->cycles = value->value; | |
| } else if (value->id == perf_clock_id) { | |
| count->seconds = ((double) (value->value / 1000ull)) / 1000000.0; | |
| } else if (value->id == perf_instrs_id) { | |
| count->instructions = value->value; | |
| } else if (value->id == perf_cache_misses_id) { | |
| count->cache_misses = value->value; | |
| } else if (value->id == perf_cache_references_id) { | |
| count->cache_references = value->value; | |
| } else if (value->id == perf_branch_misses_id) { | |
| count->branch_misses = value->value; | |
| } else if (value->id == perf_branch_instructions_id) { | |
| count->branch_instructions = value->value; | |
| } else { | |
| fprintf(stderr, "Spurious value in perf read (%ld)\n", value->id); | |
| exit(EXIT_FAILURE); | |
| } | |
| } | |
| } | |
| // -------------------------------------------------------------------- | |
| // running the function | |
| struct max_error { | |
| float y; | |
| float x; | |
| float error; | |
| max_error(): y(0.0f), x(0.0f), error(0.0f) {} | |
| void update(float y, float x, float reference, float result) { | |
| if (isnan(reference) && !isnan(result)) { | |
| cerr << "Expected NaN in result, but got " << result << endl; | |
| cerr << "For point " << x << ", " << y << endl; | |
| exit(EXIT_FAILURE); | |
| } | |
| if (!isnan(reference) && isnan(result)) { | |
| cerr << "Unexpected NaN in result, expected " << reference << endl; | |
| cerr << "For point " << x << ", " << y << endl; | |
| exit(EXIT_FAILURE); | |
| } | |
| float error = abs(reference - result); | |
| if (error > this->error) { | |
| this->y = y; | |
| this->x = x; | |
| this->error = error; | |
| } | |
| } | |
| }; | |
| NOINLINE | |
| static void run_timed( | |
| int pre_iterations, | |
| int iterations, | |
| bool test_negative_zero, | |
| bool test_max, | |
| bool test_nan, | |
| size_t num, | |
| const string& name, | |
| void (*fun)(size_t, const float*, const float*, float*), | |
| const float* ref_out, // if null no comparison will be run | |
| const float* in_y, | |
| const float* in_x, | |
| float* out | |
| ) { | |
| if (iterations < 1) { | |
| fprintf(stderr, "iterations < 1: %d\n", iterations); | |
| exit(EXIT_FAILURE); | |
| } | |
| for (int i = 0; i < pre_iterations; i++) { | |
| fun(num, in_y, in_x, out); | |
| } | |
| struct perf_count counts; | |
| disable_perf_count(); | |
| reset_perf_count(); | |
| enable_perf_count(); | |
| // The enabling / disabling adds noise for small timings, so | |
| // wrap the loop since the loop counts for very little (one add, one cmp, one | |
| // predicted conditional jump). | |
| for (int i = 0; i < iterations; i++) { | |
| fun(num, in_y, in_x, out); | |
| } | |
| disable_perf_count(); | |
| read_perf_count(&counts); | |
| // two floats per element | |
| uint64_t bytes = ((uint64_t) num) * sizeof(float) * 2; | |
| double gb_per_s = (((double) bytes) / 1000000000.0) / (counts.seconds / ((double) iterations)); | |
| double time = counts.seconds; | |
| const char *unit = " s"; | |
| if (time < 0.1) { | |
| time *= 1000.0; | |
| unit = "ms"; | |
| } | |
| if (time < 0.1) { | |
| time *= 1000.0; | |
| unit = "us"; | |
| } | |
| constexpr int padded_name_size = 10; | |
| char padded_name[padded_name_size]; | |
| int name_len = snprintf(padded_name, padded_name_size, "%s:", name.c_str()); | |
| for (int i = name_len; i < padded_name_size; i++) { | |
| padded_name[i] = ' '; | |
| } | |
| padded_name[padded_name_size-1] = '\0'; | |
| printf( | |
| "%s %5.2f%s, %6.2fGB/s, %6.2f cycles/elem, %6.2f instrs/elem, %5.2f instrs/cycle, %5.2f branches/elem, %5.2f%% branch misses, %5.2f%% cache misses, %5.2fGHz", | |
| padded_name, time, unit, gb_per_s, | |
| ((double) counts.cycles) / ((double) iterations) / ((double) num), | |
| ((double) counts.instructions) / ((double) iterations) / ((double) num), | |
| ((double) counts.instructions) / ((double) counts.cycles), | |
| ((double) counts.branch_instructions) / ((double) iterations) / ((double) num), | |
| 100.0 * ((double) counts.branch_misses) / ((double) counts.branch_instructions), | |
| 100.0 * ((double) counts.cache_misses) / ((double) counts.cache_references), | |
| ((double) counts.cycles) / counts.seconds / 1000000000.0 | |
| ); | |
| if (ref_out != nullptr) { | |
| max_error max_error; | |
| for (size_t ix = 0; ix < num; ix++) { | |
| float y = in_y[ix]; | |
| float x = in_x[ix]; | |
| if (!test_negative_zero && (y == -0.0f || x == -0.0f)) { continue; } | |
| if ( | |
| !test_max && | |
| (fabs(y) == numeric_limits<float>::max() || fabs(x) == numeric_limits<float>::max()) | |
| ) { continue; } | |
| if (!test_nan && (isnan(y) || isnan(x))) { continue; } | |
| max_error.update(in_y[ix], in_x[ix], ref_out[ix], out[ix]); | |
| } | |
| printf( | |
| ", %.9fdeg max error, max error point: %+.9f, %+.9f\n", | |
| max_error.error * 180.0f / M_PI, max_error.x, max_error.y | |
| ); | |
| } else { | |
| printf("\n"); | |
| } | |
| } | |
| #define run_timed_atan2(pre_iterations, iterations, test_negative_zero, test_max, test_nan, num, fun, ref_out, in_y, in_x, out) \ | |
| run_timed(pre_iterations, iterations, test_negative_zero, test_max, test_nan, num, #fun, atan2_ ## fun, ref_out, in_y, in_x, out) | |
| // -------------------------------------------------------------------- | |
| // main | |
| NOINLINE | |
| static pair<string, size_t> format_size(size_t size) { | |
| string unit = "B"; | |
| if (size > 10000ull) { | |
| size /= 1000ull; | |
| unit = "kB"; | |
| } | |
| if (size > 10000ull) { | |
| size /= 1000ull; | |
| unit = "MB"; | |
| } | |
| if (size > 10000ull) { | |
| size /= 1000ull; | |
| unit = "GB"; | |
| } | |
| return { unit, size }; | |
| } | |
| struct config { | |
| bool random_points = false; | |
| bool test_edge_cases = false; | |
| bool test_baseline = true; | |
| bool test_auto_1 = true; | |
| bool test_auto_2 = true; | |
| bool test_auto_3 = true; | |
| bool test_auto_4 = true; | |
| bool test_manual_1 = true; | |
| bool test_manual_2 = true; | |
| }; | |
| int main(int argc, const char* argv[]) { | |
| cout.precision(numeric_limits<float>::max_digits10); | |
| pin_to_cpu_0(); | |
| perf_init(); | |
| config config; | |
| const auto bad_usage = [&argv]() { | |
| cerr << "Usage: " << argv[0] << " [--random-points] [--test-edge-cases] [--functions COMMA_SEPARATED_FUNCTIONS] NUMBER_OF_TEST_CASES NUMBER_OF_WARM_UPS NUMBER_OF_ITERATIONS" << endl; | |
| exit(EXIT_FAILURE); | |
| }; | |
| vector<string> arguments; | |
| for (int i = 1; i < argc; i++) { | |
| string arg{argv[i]}; | |
| if (arg == "--random-points") { | |
| config.random_points = true; | |
| } else if (arg == "--test-edge-cases") { | |
| config.test_edge_cases = true; | |
| } else if (arg == "--functions") { | |
| config.test_baseline = false; | |
| config.test_auto_1 = false; | |
| config.test_auto_2 = false; | |
| config.test_auto_3 = false; | |
| config.test_auto_4 = false; | |
| config.test_manual_1 = false; | |
| config.test_manual_2 = false; | |
| if (i < argc - 1) { | |
| i++; | |
| stringstream functions{ argv[i] }; | |
| for (string function; getline(functions, function, ','); ) { | |
| if (function == "baseline") { | |
| config.test_baseline = true; | |
| } else if (function == "auto_1") { | |
| config.test_auto_1 = true; | |
| } else if (function == "auto_2") { | |
| config.test_auto_2 = true; | |
| } else if (function == "auto_3") { | |
| config.test_auto_3 = true; | |
| } else if (function == "auto_4") { | |
| config.test_auto_4 = true; | |
| } | |
| #ifdef USE_AVX | |
| else if (function == "manual") { | |
| config.test_manual_1 = true; | |
| } else if (function == "manual_2") { | |
| config.test_manual_2 = true; | |
| } | |
| #endif | |
| else { | |
| cerr << "Bad function " << function << endl; | |
| bad_usage(); | |
| } | |
| } | |
| } else { | |
| cerr << "Expecting argument after --functions" << endl; | |
| bad_usage(); | |
| } | |
| } else { | |
| arguments.emplace_back(arg); | |
| } | |
| } | |
| if (arguments.size() != 3) { | |
| bad_usage(); | |
| } | |
| size_t desired_cases; | |
| if (sscanf(arguments.at(0).c_str(), "%zu", &desired_cases) != 1) { | |
| cerr << "Could not parse number of cases" << endl; | |
| bad_usage(); | |
| } | |
| int pre_iterations; | |
| if (sscanf(arguments.at(1).c_str(), "%d", &pre_iterations) != 1) { | |
| cerr << "Could not parse warm ups" << endl; | |
| bad_usage(); | |
| } | |
| int iterations; | |
| if (sscanf(arguments.at(2).c_str(), "%d", &iterations) != 1) { | |
| cerr << "Could not parse iterations" << endl; | |
| bad_usage(); | |
| } | |
| size_t cases; | |
| float* ys; | |
| float* xs; | |
| float* ref_out; | |
| float* out; | |
| generate_data( | |
| config.random_points, config.test_edge_cases, | |
| desired_cases, | |
| &cases, &ys, &xs, | |
| &ref_out, | |
| &out | |
| ); | |
| if (!config.test_baseline) { | |
| free(ref_out); | |
| ref_out = nullptr; | |
| } | |
| const auto formatted_input_size = format_size(sizeof(float) * 2 * cases); | |
| const auto formatted_output_size = format_size(sizeof(float) * cases); | |
| cout << "Tests will read " << formatted_input_size.second << formatted_input_size.first << " and write " << formatted_output_size.second << formatted_output_size.first << " (" << cases << " points)" << endl; | |
| cout << "Running " << pre_iterations << " warm ups and " << iterations << " iterations" << endl; | |
| cout << endl; | |
| if (config.test_baseline) { | |
| run_timed(pre_iterations, iterations, true, true, true, cases, "baseline", atan2_baseline, nullptr, ys, xs, ref_out); | |
| } | |
| // auto_1 to auto_3 do not support the max value because the input gets | |
| // reduce to negative zero in that case and the atan_input >= 0 branch | |
| // doesn't preserve the sign properly. | |
| // | |
| // Similarly, none of the functions apart from manual_1 and manual_2 support | |
| // negative zero properly because of the x < 0 branch. | |
| // | |
| // manual_2 silently drop NaNs, it can be made to not do that at slight | |
| // performance hit (see comment in it). | |
| if (config.test_auto_1) { | |
| run_timed_atan2(pre_iterations, iterations, false, false, true, cases, auto_1, ref_out, ys, xs, out); | |
| } | |
| if (config.test_auto_2) { | |
| run_timed_atan2(pre_iterations, iterations, false, false, true, cases, auto_2, ref_out, ys, xs, out); | |
| } | |
| if (config.test_auto_3) { | |
| run_timed_atan2(pre_iterations, iterations, false, false, true, cases, auto_3, ref_out, ys, xs, out); | |
| } | |
| if (config.test_auto_4) { | |
| run_timed_atan2(pre_iterations, iterations, false, true, true, cases, auto_4, ref_out, ys, xs, out); | |
| } | |
| #ifdef USE_AVX | |
| if (config.test_manual_1) { | |
| run_timed_atan2(pre_iterations, iterations, true, true, true, cases, manual_1, ref_out, ys, xs, out); | |
| } | |
| if (config.test_manual_2) { | |
| run_timed_atan2(pre_iterations, iterations, true, true, true, cases, manual_2, ref_out, ys, xs, out); | |
| } | |
| #endif | |
| cout << endl; | |
| free(ys); free(xs); free(ref_out); free(out); | |
| perf_close(); | |
| return EXIT_SUCCESS; | |
| } |
@bitonic , Hi I am using ubuntu 20.04 and I built your vectorized-atan2f.cpp with clang++-12 and run ./vectorized-atan2f but always got the below error:
Error opening leader 0.
Don't know how to resolve this :(
I have installed perf tool in my ubuntu.
Hey,
for the case y < 0, x = 0 atan2_auto_3 returns Pi/2, but the result should be -Pi/2.
A fun problem I ran into using code derived from atan2_auto_4(): it is wrong when x is negative zero. Say if y=1 and x=-0, then atan_input is -0 and atan_fma_approximation() returns -0. The quadrant adjustment on line 260 propagates the sign from the -0 giving res=-pi/2, and then the condition x < 0.0f is false so -pi/2 is the final result. This is a major discontinuity since the correct result is pi/2. To fix it, I believe you can change line 262 from:
if (x < 0.0f) {to
if (signbit(x)) {
I figured out why I was getting no perf events.
From the manual on perf_event_open(2)
Turns out there was one too many hardware counters for my CPU so none of them returned values, but you also don't get any errors. (Aside: though I was getting a bogus value out of the software task-clock counter, which is weird)
Got a similar result from
perfbut it correctly counts as many as it can and then says<not counted>And after the bit about
nmi_watchdog, I did let me get results from all the counters.I'll have to look later why/how
perfdetects that there are too many counters and/or detecting the number of available ones beforehand.