#include //~2.4x faster than non-SSE unrolled version. //Uses row-major order (D3D or non-OpenGL layout). void mul(float result[4][4], float a[4][4], float b[4][4]) { __m128 otherRow0 = _mm_loadu_ps(b[0]); __m128 otherRow1 = _mm_loadu_ps(b[1]); __m128 otherRow2 = _mm_loadu_ps(b[2]); __m128 otherRow3 = _mm_loadu_ps(b[3]); __m128 newRow0 = _mm_mul_ps(otherRow0, _mm_set1_ps(a[0][0])); newRow0 = _mm_add_ps(newRow0, _mm_mul_ps(otherRow1, _mm_set1_ps(a[0][1]))); newRow0 = _mm_add_ps(newRow0, _mm_mul_ps(otherRow2, _mm_set1_ps(a[0][2]))); newRow0 = _mm_add_ps(newRow0, _mm_mul_ps(otherRow3, _mm_set1_ps(a[0][3]))); __m128 newRow1 = _mm_mul_ps(otherRow0, _mm_set1_ps(a[1][0])); newRow1 = _mm_add_ps(newRow1, _mm_mul_ps(otherRow1, _mm_set1_ps(a[1][1]))); newRow1 = _mm_add_ps(newRow1, _mm_mul_ps(otherRow2, _mm_set1_ps(a[1][2]))); newRow1 = _mm_add_ps(newRow1, _mm_mul_ps(otherRow3, _mm_set1_ps(a[1][3]))); __m128 newRow2 = _mm_mul_ps(otherRow0, _mm_set1_ps(a[2][0])); newRow2 = _mm_add_ps(newRow2, _mm_mul_ps(otherRow1, _mm_set1_ps(a[2][1]))); newRow2 = _mm_add_ps(newRow2, _mm_mul_ps(otherRow2, _mm_set1_ps(a[2][2]))); newRow2 = _mm_add_ps(newRow2, _mm_mul_ps(otherRow3, _mm_set1_ps(a[2][3]))); __m128 newRow3 = _mm_mul_ps(otherRow0, _mm_set1_ps(a[3][0])); newRow3 = _mm_add_ps(newRow3, _mm_mul_ps(otherRow1, _mm_set1_ps(a[3][1]))); newRow3 = _mm_add_ps(newRow3, _mm_mul_ps(otherRow2, _mm_set1_ps(a[3][2]))); newRow3 = _mm_add_ps(newRow3, _mm_mul_ps(otherRow3, _mm_set1_ps(a[3][3]))); _mm_storeu_ps(result[0], newRow0); _mm_storeu_ps(result[1], newRow1); _mm_storeu_ps(result[2], newRow2); _mm_storeu_ps(result[3], newRow3); }