pendingchaos · September 1, 2025 09:56 · May 15, 2016 · Apr 23, 2016 · Oct 25, 2015 · Oct 25, 2015
diff --git a/matrix_mul.cpp b/matrix_mul.cpp
@@ -1,4 +1,4 @@
-#include <mmintrin.h>
+#include <xmmintrin.h>
 
 //~2.4x faster than non-SSE unrolled version.
 //Uses row-major order (D3D or non-OpenGL layout).
@@ -29,8 +29,8 @@ void mul(float result[4][4], float a[4][4], float b[4][4])
     newRow3 = _mm_add_ps(newRow3, _mm_mul_ps(otherRow2, _mm_set1_ps(a[3][2])));
     newRow3 = _mm_add_ps(newRow3, _mm_mul_ps(otherRow3, _mm_set1_ps(a[3][3])));
 
-    _mm_store_ps(result[0], newRow0);
-    _mm_store_ps(result[1], newRow1);
-    _mm_store_ps(result[2], newRow2);
-    _mm_store_ps(result[3], newRow3);
+    _mm_storeu_ps(result[0], newRow0);
+    _mm_storeu_ps(result[1], newRow1);
+    _mm_storeu_ps(result[2], newRow2);
+    _mm_storeu_ps(result[3], newRow3);
 }
diff --git a/matrix_mul.cpp b/matrix_mul.cpp
@@ -4,27 +4,27 @@
 //Uses row-major order (D3D or non-OpenGL layout).
 void mul(float result[4][4], float a[4][4], float b[4][4])
 {
-    __m128 otherRow0 __attribute__((aligned(16))) = _mm_loadu_ps(b[0]);
-    __m128 otherRow1 __attribute__((aligned(16))) = _mm_loadu_ps(b[1]);
-    __m128 otherRow2 __attribute__((aligned(16))) = _mm_loadu_ps(b[2]);
-    __m128 otherRow3 __attribute__((aligned(16))) = _mm_loadu_ps(b[3]);
+    __m128 otherRow0 = _mm_loadu_ps(b[0]);
+    __m128 otherRow1 = _mm_loadu_ps(b[1]);
+    __m128 otherRow2 = _mm_loadu_ps(b[2]);
+    __m128 otherRow3 = _mm_loadu_ps(b[3]);
 
-    __m128 newRow0 __attribute__((aligned(16))) = _mm_mul_ps(otherRow0, _mm_set1_ps(a[0][0]));
+    __m128 newRow0 = _mm_mul_ps(otherRow0, _mm_set1_ps(a[0][0]));
     newRow0 = _mm_add_ps(newRow0, _mm_mul_ps(otherRow1, _mm_set1_ps(a[0][1])));
     newRow0 = _mm_add_ps(newRow0, _mm_mul_ps(otherRow2, _mm_set1_ps(a[0][2])));
     newRow0 = _mm_add_ps(newRow0, _mm_mul_ps(otherRow3, _mm_set1_ps(a[0][3])));
 
-    __m128 newRow1 __attribute__((aligned(16))) = _mm_mul_ps(otherRow0, _mm_set1_ps(a[1][0]));
+    __m128 newRow1 = _mm_mul_ps(otherRow0, _mm_set1_ps(a[1][0]));
     newRow1 = _mm_add_ps(newRow1, _mm_mul_ps(otherRow1, _mm_set1_ps(a[1][1])));
     newRow1 = _mm_add_ps(newRow1, _mm_mul_ps(otherRow2, _mm_set1_ps(a[1][2])));
     newRow1 = _mm_add_ps(newRow1, _mm_mul_ps(otherRow3, _mm_set1_ps(a[1][3])));
 
-    __m128 newRow2 __attribute__((aligned(16))) = _mm_mul_ps(otherRow0, _mm_set1_ps(a[2][0]));
+    __m128 newRow2 = _mm_mul_ps(otherRow0, _mm_set1_ps(a[2][0]));
     newRow2 = _mm_add_ps(newRow2, _mm_mul_ps(otherRow1, _mm_set1_ps(a[2][1])));
     newRow2 = _mm_add_ps(newRow2, _mm_mul_ps(otherRow2, _mm_set1_ps(a[2][2])));
     newRow2 = _mm_add_ps(newRow2, _mm_mul_ps(otherRow3, _mm_set1_ps(a[2][3])));
 
-    __m128 newRow3 __attribute__((aligned(16))) = _mm_mul_ps(otherRow0, _mm_set1_ps(a[3][0]));
+    __m128 newRow3 = _mm_mul_ps(otherRow0, _mm_set1_ps(a[3][0]));
     newRow3 = _mm_add_ps(newRow3, _mm_mul_ps(otherRow1, _mm_set1_ps(a[3][1])));
     newRow3 = _mm_add_ps(newRow3, _mm_mul_ps(otherRow2, _mm_set1_ps(a[3][2])));
     newRow3 = _mm_add_ps(newRow3, _mm_mul_ps(otherRow3, _mm_set1_ps(a[3][3])));

diff --git a/matrix_mul.cpp b/matrix_mul.cpp
@@ -1,4 +1,4 @@
-#include <smmintrin.h>
+#include <mmintrin.h>
 
 //~2.4x faster than non-SSE unrolled version.
 //Uses row-major order (D3D or non-OpenGL layout).

diff --git a/matrix_mul.cpp b/matrix_mul.cpp
@@ -1,6 +1,7 @@
 #include <smmintrin.h>
 
 //~2.4x faster than non-SSE unrolled version.
+//Uses row-major order (D3D or non-OpenGL layout).
 void mul(float result[4][4], float a[4][4], float b[4][4])
 {
     __m128 otherRow0 __attribute__((aligned(16))) = _mm_loadu_ps(b[0]);

diff --git a/gistfile1.txt b/matrix_mul.cpp
similarity index 100%
rename from gistfile1.txt
rename to matrix_mul.cpp
diff --git a/gistfile1.txt b/gistfile1.txt
@@ -1,3 +1,5 @@
+#include <smmintrin.h>
+
 //~2.4x faster than non-SSE unrolled version.
 void mul(float result[4][4], float a[4][4], float b[4][4])
 {

diff --git a/gistfile1.txt b/gistfile1.txt
@@ -0,0 +1,33 @@
+//~2.4x faster than non-SSE unrolled version.
+void mul(float result[4][4], float a[4][4], float b[4][4])
+{
+    __m128 otherRow0 __attribute__((aligned(16))) = _mm_loadu_ps(b[0]);
+    __m128 otherRow1 __attribute__((aligned(16))) = _mm_loadu_ps(b[1]);
+    __m128 otherRow2 __attribute__((aligned(16))) = _mm_loadu_ps(b[2]);
+    __m128 otherRow3 __attribute__((aligned(16))) = _mm_loadu_ps(b[3]);
+
+    __m128 newRow0 __attribute__((aligned(16))) = _mm_mul_ps(otherRow0, _mm_set1_ps(a[0][0]));
+    newRow0 = _mm_add_ps(newRow0, _mm_mul_ps(otherRow1, _mm_set1_ps(a[0][1])));
+    newRow0 = _mm_add_ps(newRow0, _mm_mul_ps(otherRow2, _mm_set1_ps(a[0][2])));
+    newRow0 = _mm_add_ps(newRow0, _mm_mul_ps(otherRow3, _mm_set1_ps(a[0][3])));
+
+    __m128 newRow1 __attribute__((aligned(16))) = _mm_mul_ps(otherRow0, _mm_set1_ps(a[1][0]));
+    newRow1 = _mm_add_ps(newRow1, _mm_mul_ps(otherRow1, _mm_set1_ps(a[1][1])));
+    newRow1 = _mm_add_ps(newRow1, _mm_mul_ps(otherRow2, _mm_set1_ps(a[1][2])));
+    newRow1 = _mm_add_ps(newRow1, _mm_mul_ps(otherRow3, _mm_set1_ps(a[1][3])));
+
+    __m128 newRow2 __attribute__((aligned(16))) = _mm_mul_ps(otherRow0, _mm_set1_ps(a[2][0]));
+    newRow2 = _mm_add_ps(newRow2, _mm_mul_ps(otherRow1, _mm_set1_ps(a[2][1])));
+    newRow2 = _mm_add_ps(newRow2, _mm_mul_ps(otherRow2, _mm_set1_ps(a[2][2])));
+    newRow2 = _mm_add_ps(newRow2, _mm_mul_ps(otherRow3, _mm_set1_ps(a[2][3])));
+
+    __m128 newRow3 __attribute__((aligned(16))) = _mm_mul_ps(otherRow0, _mm_set1_ps(a[3][0]));
+    newRow3 = _mm_add_ps(newRow3, _mm_mul_ps(otherRow1, _mm_set1_ps(a[3][1])));
+    newRow3 = _mm_add_ps(newRow3, _mm_mul_ps(otherRow2, _mm_set1_ps(a[3][2])));
+    newRow3 = _mm_add_ps(newRow3, _mm_mul_ps(otherRow3, _mm_set1_ps(a[3][3])));
+
+    _mm_store_ps(result[0], newRow0);
+    _mm_store_ps(result[1], newRow1);
+    _mm_store_ps(result[2], newRow2);
+    _mm_store_ps(result[3], newRow3);
+}