Skip to content

Instantly share code, notes, and snippets.

@pendingchaos
Last active January 29, 2016 15:19
Show Gist options
  • Select an option

  • Save pendingchaos/6b6f1e43040e55fce5cd to your computer and use it in GitHub Desktop.

Select an option

Save pendingchaos/6b6f1e43040e55fce5cd to your computer and use it in GitHub Desktop.
AABB transformation
//-O0: SSE is ~8.2x faster
//-O1: SSE is ~11.6x faster
//-O2: SSE is ~23.3x faster
//-O3: SSE is ~5x faster
//-Ofast: SSE is ~1.1x faster
//Compiled with GCC 5.3.1
//Ran on an Intel(R) Core(TM) i7-3770K CPU
//These results are probably wrong.
//Matrices are in column-major layout (the OpenGL convention, not the Direct3D one).
#include <xmmintrin.h>
#include <math.h>
/*
 * Transform an axis-aligned bounding box by an affine matrix, using SSE.
 *
 * matrix_  4x4 matrix indexed as matrix_[column][row]; matrix_[3] is the
 *          translation column.
 * min/max  Box corners. Elements 0..2 are read and then overwritten with the
 *          corners of the transformed box. Element 3 is neither read nor
 *          written, matching transformAABB().
 *
 * For every output axis i the result is
 *   min[i] = t[i] + sum_j min(m[j][i] * min[j], m[j][i] * max[j])
 *   max[i] = t[i] + sum_j max(m[j][i] * min[j], m[j][i] * max[j])
 * accumulated in the same order as the scalar transformAABB().
 *
 * No alignment is required of any argument: only unaligned loads are used
 * and results are copied out through local buffers.
 */
void transformAABB_SSE(const float matrix_[4][4], float min[4], float max[4]) {
    /* Both corners start at the translation column. Note that matrix_ is
       already a pointer to the first column; taking its address would read
       the parameter slot instead of the matrix data. */
    __m128 rmin = _mm_loadu_ps(&matrix_[3][0]);
    __m128 rmax = rmin;

    for (int j = 0; j < 3; j++) {
        /* Column j holds the contribution of input axis j to every output
           axis, so scale it by the broadcast min[j] and max[j]. */
        const __m128 col = _mm_loadu_ps(&matrix_[j][0]);
        const __m128 lo = _mm_mul_ps(col, _mm_set1_ps(min[j]));
        const __m128 hi = _mm_mul_ps(col, _mm_set1_ps(max[j]));

        /* A negative matrix entry swaps which product is smaller, hence the
           per-lane min/max instead of assuming lo <= hi. */
        rmin = _mm_add_ps(rmin, _mm_min_ps(lo, hi));
        rmax = _mm_add_ps(rmax, _mm_max_ps(lo, hi));
    }

    /* Spill to locals so only the x, y, z lanes reach the caller's arrays. */
    float out_min[4];
    float out_max[4];
    _mm_storeu_ps(out_min, rmin);
    _mm_storeu_ps(out_max, rmax);
    for (int i = 0; i < 3; i++) {
        min[i] = out_min[i];
        max[i] = out_max[i];
    }
}
void transformAABB(const float matrix[4][4], float min[4], float max[4]) {
float rmin[3] = {matrix[3][0], matrix[3][1], matrix[3][2]};
float rmax[3] = {matrix[3][0], matrix[3][1], matrix[3][2]};
#define ITER(i, j) {\
float x = min[j] * matrix[j][i];\
float y = max[j] * matrix[j][i];\
rmin[i] += fmin(x, y);\
rmax[i] += fmax(x, y);\
}
ITER(0, 0)
ITER(0, 1)
ITER(0, 2)
ITER(1, 0)
ITER(1, 1)
ITER(1, 2)
ITER(2, 0)
ITER(2, 1)
ITER(2, 2)
#undef ITER
min[0] = rmin[0];
min[1] = rmin[1];
min[2] = rmin[2];
max[0] = rmax[0];
max[1] = rmax[1];
max[2] = rmax[2];
}
/*
 * Benchmark driver: applies transformAABB_SSE to the same box ten million
 * times. Swap the call for transformAABB to time the scalar version.
 *
 * Returns 0 when the accumulated result is an ordinary number and 1 when it
 * is NaN.
 */
int main(void) {
    /* 90-degree rotation about Z in [column][row] order, no translation, so
       the box stays bounded however many times it is transformed. */
    const float matrix[4][4] = {
        { 0.0f, 1.0f, 0.0f, 0.0f},
        {-1.0f, 0.0f, 0.0f, 0.0f},
        { 0.0f, 0.0f, 1.0f, 0.0f},
        { 0.0f, 0.0f, 0.0f, 1.0f},
    };
    /* 16-byte aligned so that aligned SSE stores into these arrays are valid. */
    float min[4] __attribute__((aligned(16))) = {-1.0f, -2.0f, -3.0f, 1.0f};
    float max[4] __attribute__((aligned(16))) = { 1.0f,  2.0f,  3.0f, 1.0f};

    for (long i = 0; i < 10000000; i++) {
        transformAABB_SSE(matrix, min, max);
    }

    //Stop the compiler from optimizing stuff away.
    volatile float sink = min[0] + min[1] + min[2] + max[0] + max[1] + max[2];
    /* Compare rather than convert: float-to-int conversion is undefined for
       NaN and for values outside the range of int. */
    return sink == sink ? 0 : 1;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment