pendingchaos · January 29, 2016 15:19 · Jan 29, 2016 · Jan 29, 2016 · Jan 27, 2016
diff --git a/aabb_tranform.c b/aabb_tranform.c
@@ -1,4 +1,11 @@
-//TODO: Timings
+//-O0: SSE is ~6.3x faster
+//-O1: SSE is ~4.6x faster
+//-O2: SSE is ~2.5x faster
+//-O3: SSE is ~2.2x faster
+//-Og: SSE is ~5.5x faster
+//-Ofast: SSE is ~1.7x faster
+//-Os: SSE is ~2.3x faster
+//You probably should not trust these timings.
 //Compiled with GCC 5.3.1
 //Ran on an Intel(R) Core(TM) i7-3770K CPU
 //Matrices are in column-major layout (or OpenGL or non-Direct3D layout).

diff --git a/aabb_tranform.c b/aabb_tranform.c
@@ -1,43 +1,24 @@
-//-O0: SSE is ~8.2x faster
-//-O1: SSE is ~11.6x faster
-//-O2: SSE is ~23.3x faster
-//-O3: SSE is ~5x faster
-//-Ofast: SSE is ~1.1x faster
+//TODO: Timings
 //Compiled with GCC 5.3.1
 //Ran on an Intel(R) Core(TM) i7-3770K CPU
-//These results are probably wrong.
 //Matrices are in column-major layout (or OpenGL or non-Direct3D layout).
 #include <xmmintrin.h>
 #include <math.h>
 
 void transformAABB_SSE(const float matrix_[4][4], float min[4], float max[4]) {
-    min[3] = max[3] = NAN;
-    __m128 tmin __attribute__((aligned(16))) = _mm_loadu_ps(min);
-    __m128 tmax __attribute__((aligned(16))) = _mm_loadu_ps(max);
-    __m128 rmin __attribute__((aligned(16))) = _mm_loadu_ps(((const float*)&matrix_)+12);
-    __m128 rmax __attribute__((aligned(16))) = rmin;
+    __m128 rmin = _mm_loadu_ps((const float*)matrix_+12);
+    __m128 rmax = rmin;
 
-    __m128 matrix[4] __attribute__((aligned(16)));
-    matrix[0] = _mm_loadu_ps((const float*)&matrix_);
-    matrix[1] = _mm_loadu_ps(((const float*)&matrix_)+4);
-    matrix[2] = _mm_loadu_ps(((const float*)&matrix_)+8);
-    matrix[3] = _mm_loadu_ps(((const float*)&matrix_)+12);
-    _MM_TRANSPOSE4_PS(matrix[0], matrix[1], matrix[2], matrix[3]);
-
-    __m128 x __attribute__((aligned(16))) = _mm_mul_ps(matrix[0], tmin);
-    __m128 y __attribute__((aligned(16))) = _mm_mul_ps(matrix[0], tmax);
-    rmin = _mm_add_ps(rmin, _mm_min_ps(x, y));
-    rmax = _mm_add_ps(rmax, _mm_max_ps(x, y));
-
-    x = _mm_mul_ps(matrix[1], tmin);
-    y = _mm_mul_ps(matrix[1], tmax);
-    rmin = _mm_add_ps(rmin, _mm_min_ps(x, y));
-    rmax = _mm_add_ps(rmax, _mm_max_ps(x, y));
-
-    x = _mm_mul_ps(matrix[2], tmin);
-    y = _mm_mul_ps(matrix[2], tmax);
-    rmin = _mm_add_ps(rmin, _mm_min_ps(x, y));
-    rmax = _mm_add_ps(rmax, _mm_max_ps(x, y));
+    #define ITER(j) {\
+        __m128 x = _mm_mul_ps(_mm_set1_ps(min[j]), _mm_loadu_ps((const float*)&matrix_[j]));\
+        __m128 y = _mm_mul_ps(_mm_set1_ps(max[j]), _mm_loadu_ps((const float*)&matrix_[j]));\
+        rmin = _mm_add_ps(rmin, _mm_min_ps(x, y));\
+        rmax = _mm_add_ps(rmax, _mm_max_ps(x, y));\
+    }
+    ITER(0)
+    ITER(1)
+    ITER(2)
+    #undef ITER
 
     _mm_store_ps(min, rmin);
     _mm_store_ps(max, rmax);

diff --git a/aabb_tranform.c b/aabb_tranform.c
@@ -0,0 +1,83 @@
+//-O0: SSE is ~8.2x faster
+//-O1: SSE is ~11.6x faster
+//-O2: SSE is ~23.3x faster
+//-O3: SSE is ~5x faster
+//-Ofast: SSE is ~1.1x faster
+//Compiled with GCC 5.3.1
+//Ran on an Intel(R) Core(TM) i7-3770K CPU
+//These results are probably wrong.
+//Matrices are in column-major layout (or OpenGL or non-Direct3D layout).
+#include <xmmintrin.h>
+#include <math.h>
+
+void transformAABB_SSE(const float matrix_[4][4], float min[4], float max[4]) { //SSE variant: overwrites all four elements of min and max in place.
+    min[3] = max[3] = NAN; //Clobbers the caller's 4th element of both arrays with NaN before loading them.
+    __m128 tmin __attribute__((aligned(16))) = _mm_loadu_ps(min); //Unaligned load of min[0..3].
+    __m128 tmax __attribute__((aligned(16))) = _mm_loadu_ps(max); //Unaligned load of max[0..3].
+    __m128 rmin __attribute__((aligned(16))) = _mm_loadu_ps(((const float*)&matrix_)+12); //NOTE(review): matrix_ is a pointer parameter, so &matrix_ looks like the address of the pointer itself rather than of the 16 floats - confirm.
+    __m128 rmax __attribute__((aligned(16))) = rmin; //Both accumulators start from the same loaded vector.
+
+    __m128 matrix[4] __attribute__((aligned(16))); //Local copy of four 4-float vectors.
+    matrix[0] = _mm_loadu_ps((const float*)&matrix_); //NOTE(review): same &matrix_ concern applies to these four loads.
+    matrix[1] = _mm_loadu_ps(((const float*)&matrix_)+4);
+    matrix[2] = _mm_loadu_ps(((const float*)&matrix_)+8);
+    matrix[3] = _mm_loadu_ps(((const float*)&matrix_)+12);
+    _MM_TRANSPOSE4_PS(matrix[0], matrix[1], matrix[2], matrix[3]); //Transposes the four vectors in place.
+
+    __m128 x __attribute__((aligned(16))) = _mm_mul_ps(matrix[0], tmin); //Lane-wise product of matrix[0] with the loaded min vector.
+    __m128 y __attribute__((aligned(16))) = _mm_mul_ps(matrix[0], tmax); //Lane-wise product of matrix[0] with the loaded max vector.
+    rmin = _mm_add_ps(rmin, _mm_min_ps(x, y)); //Accumulate the lane-wise smaller product.
+    rmax = _mm_add_ps(rmax, _mm_max_ps(x, y)); //Accumulate the lane-wise larger product.
+
+    x = _mm_mul_ps(matrix[1], tmin); //Same step using matrix[1].
+    y = _mm_mul_ps(matrix[1], tmax);
+    rmin = _mm_add_ps(rmin, _mm_min_ps(x, y));
+    rmax = _mm_add_ps(rmax, _mm_max_ps(x, y));
+
+    x = _mm_mul_ps(matrix[2], tmin); //Same step using matrix[2]; matrix[3] is not used after the transpose.
+    y = _mm_mul_ps(matrix[2], tmax);
+    rmin = _mm_add_ps(rmin, _mm_min_ps(x, y));
+    rmax = _mm_add_ps(rmax, _mm_max_ps(x, y));
+
+    _mm_store_ps(min, rmin); //NOTE(review): aligned store, whereas the loads above are unaligned - confirm callers pass 16-byte-aligned arrays.
+    _mm_store_ps(max, rmax); //Writes all four lanes back into max.
+}
+
+void transformAABB(const float matrix[4][4], float min[4], float max[4]) { //Scalar variant: overwrites min[0..2] and max[0..2] in place; element [3] of each is never touched.
+    float rmin[3] = {matrix[3][0], matrix[3][1], matrix[3][2]}; //Lower accumulator starts at matrix[3][0..2].
+    float rmax[3] = {matrix[3][0], matrix[3][1], matrix[3][2]}; //Upper accumulator starts at the same three values.
+    #define ITER(i, j) {\
+        float x = min[j] * matrix[j][i];\
+        float y = max[j] * matrix[j][i];\
+        rmin[i] += fmin(x, y);\
+        rmax[i] += fmax(x, y);\
+    }
+    ITER(0, 0) //Each ITER(i, j) adds the smaller of min[j]*matrix[j][i] and max[j]*matrix[j][i] to rmin[i], and the larger to rmax[i].
+    ITER(0, 1)
+    ITER(0, 2)
+    ITER(1, 0) //Output component 1.
+    ITER(1, 1)
+    ITER(1, 2)
+    ITER(2, 0) //Output component 2.
+    ITER(2, 1)
+    ITER(2, 2)
+    #undef ITER
+    min[0] = rmin[0]; //Copy the three accumulated lower values back to the caller's array.
+    min[1] = rmin[1];
+    min[2] = rmin[2];
+    max[0] = rmax[0]; //Copy the three accumulated upper values back to the caller's array.
+    max[1] = rmax[1];
+    max[2] = rmax[2];
+}
+
+int main(int argc) { //Timing driver: calls transformAABB_SSE 10,000,000 times. NOTE(review): a one-parameter main is not a standard signature, and argc is unused.
+    const float matrix[4][4]; //NOTE(review): declared without an initializer, yet passed to the callee as input.
+    float min[4]; //NOTE(review): no initializer and no alignment attribute on this array.
+    float max[4]; //NOTE(review): same as min - no initializer, no alignment attribute.
+
+    for (size_t i = 0; i < 10000000; i++)
+        transformAABB_SSE(matrix, min, max); //Only the SSE variant is invoked here; min/max are updated in place on every pass.
+
+    //Stop the compiler from optimizing stuff away.
+    return min[0] + min[1] + min[2] + max[0] + max[1] + max[2]; //The float sum is converted to int for the return value.
+}