Last active
January 29, 2016 15:19
-
-
Save pendingchaos/6b6f1e43040e55fce5cd to your computer and use it in GitHub Desktop.
Revisions
-
pendingchaos revised this gist
Jan 29, 2016 . 1 changed file with 8 additions and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,4 +1,11 @@ //-O0: SSE is ~6.3x faster //-O1: SSE is ~4.6x faster //-O2: SSE is ~2.5x faster //-O3: SSE is ~2.2x faster //-Og: SSE is ~5.5x faster //-Ofast: SSE is ~1.7x faster //-Os: SSE is ~2.3x faster //You probably should not trust these timings. //Compiled with GCC 5.3.1 //Ran on a Intel(R) Core(TM) i7-3770K CPU //Matrices are in column-major layout (or OpenGL or non-Direct3D layout). -
pendingchaos revised this gist
Jan 29, 2016 . 1 changed file with 13 additions and 32 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,43 +1,24 @@ //TODO: Timings //Compiled with GCC 5.3.1 //Ran on a Intel(R) Core(TM) i7-3770K CPU //Matrices are in column-major layout (or OpenGL or non-Direct3D layout). #include <xmmintrin.h> #include <math.h> void transformAABB_SSE(const float matrix_[4][4], float min[4], float max[4]) { __m128 rmin = _mm_loadu_ps((const float*)matrix_+12); __m128 rmax = rmin; #define ITER(j) {\ __m128 x = _mm_mul_ps(_mm_set1_ps(min[j]), _mm_loadu_ps((const float*)&matrix_[j]));\ __m128 y = _mm_mul_ps(_mm_set1_ps(max[j]), _mm_loadu_ps((const float*)&matrix_[j]));\ rmin = _mm_add_ps(rmin, _mm_min_ps(x, y));\ rmax = _mm_add_ps(rmax, _mm_max_ps(x, y));\ } ITER(0) ITER(1) ITER(2) #undef ITER _mm_store_ps(min, rmin); _mm_store_ps(max, rmax); -
pendingchaos created this gist
Jan 27, 2016 .There are no files selected for viewing
//-O0: SSE is ~8.2x faster
//-O1: SSE is ~11.6x faster
//-O2: SSE is ~23.3x faster
//-O3: SSE is ~5x faster
//-Ofast: SSE is ~1.1x faster
//Compiled with GCC 5.3.1
//Ran on an Intel(R) Core(TM) i7-3770K CPU
//These results are probably wrong.
//NOTE: the timings above were recorded with the initial SSE implementation and
//have not been re-measured for the corrected transformAABB_SSE below.
//Matrices are in column-major layout (or OpenGL or non-Direct3D layout).
#include <stddef.h>
#include <xmmintrin.h>
#include <math.h>

/*
 * Transforms an axis-aligned bounding box by a 4x4 column-major matrix (SSE).
 *
 * matrix_ : matrix_[j] is column j; matrix_[3] is added as the translation.
 * min/max : on entry, elements 0..2 hold the box corners (element 3 is not
 *           read). On return, elements 0..2 hold the transformed box and
 *           element 3 holds the same sum evaluated on the fourth row, i.e.
 *           matrix_[3][3] plus the min/max of the column-3 products.
 *
 * Produces the same sums, in the same order, as transformAABB() for rows 0..2.
 * No alignment is required of matrix_, min or max.
 */
void transformAABB_SSE(const float matrix_[4][4], float min[4], float max[4]) {
    // matrix_ is already a pointer to the first column; index it directly.
    // (Taking &matrix_ would yield the address of the parameter itself.)
    __m128 rmin = _mm_loadu_ps(matrix_[3]);
    __m128 rmax = rmin;

    for (int j = 0; j < 3; j++) {
        // Scale column j by the j-th extent of each corner, then keep the
        // smaller product for the new minimum and the larger for the maximum.
        const __m128 column = _mm_loadu_ps(matrix_[j]);
        const __m128 x = _mm_mul_ps(_mm_set1_ps(min[j]), column);
        const __m128 y = _mm_mul_ps(_mm_set1_ps(max[j]), column);
        rmin = _mm_add_ps(rmin, _mm_min_ps(x, y));
        rmax = _mm_add_ps(rmax, _mm_max_ps(x, y));
    }

    // Unaligned stores: callers pass plain float[4] arrays.
    _mm_storeu_ps(min, rmin);
    _mm_storeu_ps(max, rmax);
}

/*
 * Scalar reference implementation of the AABB transform.
 *
 * For each output row i in 0..2 the result starts at matrix[3][i] and, for
 * each column j in 0..2, accumulates the smaller (for min) or larger (for max)
 * of min[j] * matrix[j][i] and max[j] * matrix[j][i].
 * Only elements 0..2 of min and max are read and written.
 */
void transformAABB(const float matrix[4][4], float min[4], float max[4]) {
    // Results go to temporaries because every row reads all of min/max.
    float rmin[3];
    float rmax[3];

    for (size_t i = 0; i < 3; i++) {
        rmin[i] = matrix[3][i];
        rmax[i] = matrix[3][i];
        for (size_t j = 0; j < 3; j++) {
            const float x = min[j] * matrix[j][i];
            const float y = max[j] * matrix[j][i];
            rmin[i] += fminf(x, y);
            rmax[i] += fmaxf(x, y);
        }
    }

    for (size_t i = 0; i < 3; i++) {
        min[i] = rmin[i];
        max[i] = rmax[i];
    }
}

/*
 * Benchmark driver: applies transformAABB_SSE repeatedly to one box.
 * The identity matrix keeps the box unchanged across iterations, so the
 * values stay finite for the whole run.
 */
int main(void) {
    enum { ITERATIONS = 10000000 };

    const float matrix[4][4] = {
        {1.0f, 0.0f, 0.0f, 0.0f},
        {0.0f, 1.0f, 0.0f, 0.0f},
        {0.0f, 0.0f, 1.0f, 0.0f},
        {0.0f, 0.0f, 0.0f, 1.0f},
    };
    float min[4] = {-1.0f, -1.0f, -1.0f, 1.0f};
    float max[4] = {1.0f, 1.0f, 1.0f, 1.0f};

    for (size_t i = 0; i < ITERATIONS; i++) {
        transformAABB_SSE(matrix, min, max);
    }

    //Stop the compiler from optimizing stuff away.
    volatile float sink = min[0] + min[1] + min[2] + max[0] + max[1] + max[2];
    (void)sink;
    return 0;
}