Last active
September 1, 2025 09:56
-
-
Save pendingchaos/0fe82d6d264cb68cb5f4 to your computer and use it in GitHub Desktop.
Revisions
-
pendingchaos revised this gist
May 15, 2016 . 1 changed file with 5 additions and 5 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,4 +1,4 @@ #include <xmmintrin.h> //~2.4x faster than non-SSE unrolled version. //Uses row-major order (D3D or non-OpenGL layout). @@ -29,8 +29,8 @@ void mul(float result[4][4], float a[4][4], float b[4][4]) newRow3 = _mm_add_ps(newRow3, _mm_mul_ps(otherRow2, _mm_set1_ps(a[3][2]))); newRow3 = _mm_add_ps(newRow3, _mm_mul_ps(otherRow3, _mm_set1_ps(a[3][3]))); _mm_storeu_ps(result[0], newRow0); _mm_storeu_ps(result[1], newRow1); _mm_storeu_ps(result[2], newRow2); _mm_storeu_ps(result[3], newRow3); } -
pendingchaos revised this gist
Apr 23, 2016 . 1 changed file with 8 additions and 8 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -4,27 +4,27 @@ //Uses row-major order (D3D or non-OpenGL layout). void mul(float result[4][4], float a[4][4], float b[4][4]) { __m128 otherRow0 = _mm_loadu_ps(b[0]); __m128 otherRow1 = _mm_loadu_ps(b[1]); __m128 otherRow2 = _mm_loadu_ps(b[2]); __m128 otherRow3 = _mm_loadu_ps(b[3]); __m128 newRow0 = _mm_mul_ps(otherRow0, _mm_set1_ps(a[0][0])); newRow0 = _mm_add_ps(newRow0, _mm_mul_ps(otherRow1, _mm_set1_ps(a[0][1]))); newRow0 = _mm_add_ps(newRow0, _mm_mul_ps(otherRow2, _mm_set1_ps(a[0][2]))); newRow0 = _mm_add_ps(newRow0, _mm_mul_ps(otherRow3, _mm_set1_ps(a[0][3]))); __m128 newRow1 = _mm_mul_ps(otherRow0, _mm_set1_ps(a[1][0])); newRow1 = _mm_add_ps(newRow1, _mm_mul_ps(otherRow1, _mm_set1_ps(a[1][1]))); newRow1 = _mm_add_ps(newRow1, _mm_mul_ps(otherRow2, _mm_set1_ps(a[1][2]))); newRow1 = _mm_add_ps(newRow1, _mm_mul_ps(otherRow3, _mm_set1_ps(a[1][3]))); __m128 newRow2 = _mm_mul_ps(otherRow0, _mm_set1_ps(a[2][0])); newRow2 = _mm_add_ps(newRow2, _mm_mul_ps(otherRow1, _mm_set1_ps(a[2][1]))); newRow2 = _mm_add_ps(newRow2, _mm_mul_ps(otherRow2, _mm_set1_ps(a[2][2]))); newRow2 = _mm_add_ps(newRow2, _mm_mul_ps(otherRow3, _mm_set1_ps(a[2][3]))); __m128 newRow3 = _mm_mul_ps(otherRow0, _mm_set1_ps(a[3][0])); newRow3 = _mm_add_ps(newRow3, _mm_mul_ps(otherRow1, _mm_set1_ps(a[3][1]))); newRow3 = _mm_add_ps(newRow3, _mm_mul_ps(otherRow2, _mm_set1_ps(a[3][2]))); newRow3 = _mm_add_ps(newRow3, _mm_mul_ps(otherRow3, _mm_set1_ps(a[3][3]))); -
pendingchaos revised this gist
Oct 25, 2015 . 1 changed file with 1 addition and 1 deletion. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,4 +1,4 @@ #include <mmintrin.h> //~2.4x faster than non-SSE unrolled version. //Uses row-major order (D3D or non-OpenGL layout). -
pendingchaos revised this gist
Oct 25, 2015 . 1 changed file with 1 addition and 0 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,6 +1,7 @@ #include <smmintrin.h> //~2.4x faster than non-SSE unrolled version. //Uses row-major order (D3D or non-OpenGL layout). void mul(float result[4][4], float a[4][4], float b[4][4]) { __m128 otherRow0 __attribute__((aligned(16))) = _mm_loadu_ps(b[0]); -
pendingchaos renamed this gist
Oct 25, 2015 . 1 changed file with 0 additions and 0 deletions. There are no files selected for viewing
File renamed without changes. -
pendingchaos revised this gist
Oct 25, 2015 . 1 changed file with 2 additions and 0 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,3 +1,5 @@ #include <smmintrin.h> //~2.4x faster than non-SSE unrolled version. void mul(float result[4][4], float a[4][4], float b[4][4]) { -
pendingchaos revised this gist
Oct 25, 2015 . No changes. There are no files selected for viewing
-
pendingchaos created this gist
Oct 25, 2015 . There are no files selected for viewing
#include <xmmintrin.h>

//~2.4x faster than non-SSE unrolled version.
/*
 * Multiplies two 4x4 single-precision matrices with SSE: result = a * b.
 *
 * Row i of the product is built as
 *     a[i][0] * b[0] + a[i][1] * b[1] + a[i][2] * b[2] + a[i][3] * b[3]
 * where b[k] is a whole row of b held in one __m128 and a[i][k] is broadcast
 * to all four lanes.
 *
 * result: receives the 4x4 product.
 * a:      left-hand operand.
 * b:      right-hand operand.
 *
 * Every load and store uses the unaligned form, so none of the arrays needs
 * 16-byte alignment; an aligned store would fault on a result buffer that is
 * only float-aligned. All four product rows are computed before the first
 * store, so result may be the same array as a or b.
 */
void mul(float result[4][4], float a[4][4], float b[4][4])
{
    /* Load b once; each of its rows is reused for every output row. */
    __m128 otherRow0 = _mm_loadu_ps(b[0]);
    __m128 otherRow1 = _mm_loadu_ps(b[1]);
    __m128 otherRow2 = _mm_loadu_ps(b[2]);
    __m128 otherRow3 = _mm_loadu_ps(b[3]);

    __m128 newRow0 = _mm_mul_ps(otherRow0, _mm_set1_ps(a[0][0]));
    newRow0 = _mm_add_ps(newRow0, _mm_mul_ps(otherRow1, _mm_set1_ps(a[0][1])));
    newRow0 = _mm_add_ps(newRow0, _mm_mul_ps(otherRow2, _mm_set1_ps(a[0][2])));
    newRow0 = _mm_add_ps(newRow0, _mm_mul_ps(otherRow3, _mm_set1_ps(a[0][3])));

    __m128 newRow1 = _mm_mul_ps(otherRow0, _mm_set1_ps(a[1][0]));
    newRow1 = _mm_add_ps(newRow1, _mm_mul_ps(otherRow1, _mm_set1_ps(a[1][1])));
    newRow1 = _mm_add_ps(newRow1, _mm_mul_ps(otherRow2, _mm_set1_ps(a[1][2])));
    newRow1 = _mm_add_ps(newRow1, _mm_mul_ps(otherRow3, _mm_set1_ps(a[1][3])));

    __m128 newRow2 = _mm_mul_ps(otherRow0, _mm_set1_ps(a[2][0]));
    newRow2 = _mm_add_ps(newRow2, _mm_mul_ps(otherRow1, _mm_set1_ps(a[2][1])));
    newRow2 = _mm_add_ps(newRow2, _mm_mul_ps(otherRow2, _mm_set1_ps(a[2][2])));
    newRow2 = _mm_add_ps(newRow2, _mm_mul_ps(otherRow3, _mm_set1_ps(a[2][3])));

    __m128 newRow3 = _mm_mul_ps(otherRow0, _mm_set1_ps(a[3][0]));
    newRow3 = _mm_add_ps(newRow3, _mm_mul_ps(otherRow1, _mm_set1_ps(a[3][1])));
    newRow3 = _mm_add_ps(newRow3, _mm_mul_ps(otherRow2, _mm_set1_ps(a[3][2])));
    newRow3 = _mm_add_ps(newRow3, _mm_mul_ps(otherRow3, _mm_set1_ps(a[3][3])));

    /* Unaligned stores: result is only guaranteed to be float-aligned. */
    _mm_storeu_ps(result[0], newRow0);
    _mm_storeu_ps(result[1], newRow1);
    _mm_storeu_ps(result[2], newRow2);
    _mm_storeu_ps(result[3], newRow3);
}