Skip to content

Instantly share code, notes, and snippets.

@pendingchaos
Last active September 1, 2025 09:56
Show Gist options
  • Save pendingchaos/0fe82d6d264cb68cb5f4 to your computer and use it in GitHub Desktop.
Save pendingchaos/0fe82d6d264cb68cb5f4 to your computer and use it in GitHub Desktop.

Revisions

  1. pendingchaos revised this gist May 15, 2016. 1 changed file with 5 additions and 5 deletions.
    10 changes: 5 additions & 5 deletions matrix_mul.cpp
    Original file line number Diff line number Diff line change
    @@ -1,4 +1,4 @@
    #include <mmintrin.h>
    #include <xmmintrin.h>

    //~2.4x faster than non-SSE unrolled version.
    //Uses row-major order (D3D or non-OpenGL layout).
    @@ -29,8 +29,8 @@ void mul(float result[4][4], float a[4][4], float b[4][4])
    newRow3 = _mm_add_ps(newRow3, _mm_mul_ps(otherRow2, _mm_set1_ps(a[3][2])));
    newRow3 = _mm_add_ps(newRow3, _mm_mul_ps(otherRow3, _mm_set1_ps(a[3][3])));

    _mm_store_ps(result[0], newRow0);
    _mm_store_ps(result[1], newRow1);
    _mm_store_ps(result[2], newRow2);
    _mm_store_ps(result[3], newRow3);
    _mm_storeu_ps(result[0], newRow0);
    _mm_storeu_ps(result[1], newRow1);
    _mm_storeu_ps(result[2], newRow2);
    _mm_storeu_ps(result[3], newRow3);
    }
  2. pendingchaos revised this gist Apr 23, 2016. 1 changed file with 8 additions and 8 deletions.
    16 changes: 8 additions & 8 deletions matrix_mul.cpp
    Original file line number Diff line number Diff line change
    @@ -4,27 +4,27 @@
    //Uses row-major order (D3D or non-OpenGL layout).
    void mul(float result[4][4], float a[4][4], float b[4][4])
    {
    __m128 otherRow0 __attribute__((aligned(16))) = _mm_loadu_ps(b[0]);
    __m128 otherRow1 __attribute__((aligned(16))) = _mm_loadu_ps(b[1]);
    __m128 otherRow2 __attribute__((aligned(16))) = _mm_loadu_ps(b[2]);
    __m128 otherRow3 __attribute__((aligned(16))) = _mm_loadu_ps(b[3]);
    __m128 otherRow0 = _mm_loadu_ps(b[0]);
    __m128 otherRow1 = _mm_loadu_ps(b[1]);
    __m128 otherRow2 = _mm_loadu_ps(b[2]);
    __m128 otherRow3 = _mm_loadu_ps(b[3]);

    __m128 newRow0 __attribute__((aligned(16))) = _mm_mul_ps(otherRow0, _mm_set1_ps(a[0][0]));
    __m128 newRow0 = _mm_mul_ps(otherRow0, _mm_set1_ps(a[0][0]));
    newRow0 = _mm_add_ps(newRow0, _mm_mul_ps(otherRow1, _mm_set1_ps(a[0][1])));
    newRow0 = _mm_add_ps(newRow0, _mm_mul_ps(otherRow2, _mm_set1_ps(a[0][2])));
    newRow0 = _mm_add_ps(newRow0, _mm_mul_ps(otherRow3, _mm_set1_ps(a[0][3])));

    __m128 newRow1 __attribute__((aligned(16))) = _mm_mul_ps(otherRow0, _mm_set1_ps(a[1][0]));
    __m128 newRow1 = _mm_mul_ps(otherRow0, _mm_set1_ps(a[1][0]));
    newRow1 = _mm_add_ps(newRow1, _mm_mul_ps(otherRow1, _mm_set1_ps(a[1][1])));
    newRow1 = _mm_add_ps(newRow1, _mm_mul_ps(otherRow2, _mm_set1_ps(a[1][2])));
    newRow1 = _mm_add_ps(newRow1, _mm_mul_ps(otherRow3, _mm_set1_ps(a[1][3])));

    __m128 newRow2 __attribute__((aligned(16))) = _mm_mul_ps(otherRow0, _mm_set1_ps(a[2][0]));
    __m128 newRow2 = _mm_mul_ps(otherRow0, _mm_set1_ps(a[2][0]));
    newRow2 = _mm_add_ps(newRow2, _mm_mul_ps(otherRow1, _mm_set1_ps(a[2][1])));
    newRow2 = _mm_add_ps(newRow2, _mm_mul_ps(otherRow2, _mm_set1_ps(a[2][2])));
    newRow2 = _mm_add_ps(newRow2, _mm_mul_ps(otherRow3, _mm_set1_ps(a[2][3])));

    __m128 newRow3 __attribute__((aligned(16))) = _mm_mul_ps(otherRow0, _mm_set1_ps(a[3][0]));
    __m128 newRow3 = _mm_mul_ps(otherRow0, _mm_set1_ps(a[3][0]));
    newRow3 = _mm_add_ps(newRow3, _mm_mul_ps(otherRow1, _mm_set1_ps(a[3][1])));
    newRow3 = _mm_add_ps(newRow3, _mm_mul_ps(otherRow2, _mm_set1_ps(a[3][2])));
    newRow3 = _mm_add_ps(newRow3, _mm_mul_ps(otherRow3, _mm_set1_ps(a[3][3])));
  3. pendingchaos revised this gist Oct 25, 2015. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion matrix_mul.cpp
    Original file line number Diff line number Diff line change
    @@ -1,4 +1,4 @@
    #include <smmintrin.h>
    #include <mmintrin.h>

    //~2.4x faster than non-SSE unrolled version.
    //Uses row-major order (D3D or non-OpenGL layout).
  4. pendingchaos revised this gist Oct 25, 2015. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions matrix_mul.cpp
    Original file line number Diff line number Diff line change
    @@ -1,6 +1,7 @@
    #include <smmintrin.h>

    //~2.4x faster than non-SSE unrolled version.
    //Uses row-major order (D3D or non-OpenGL layout).
    void mul(float result[4][4], float a[4][4], float b[4][4])
    {
    __m128 otherRow0 __attribute__((aligned(16))) = _mm_loadu_ps(b[0]);
  5. pendingchaos renamed this gist Oct 25, 2015. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  6. pendingchaos revised this gist Oct 25, 2015. 1 changed file with 2 additions and 0 deletions.
    2 changes: 2 additions & 0 deletions gistfile1.txt
    Original file line number Diff line number Diff line change
    @@ -1,3 +1,5 @@
    #include <smmintrin.h>

    //~2.4x faster than non-SSE unrolled version.
    void mul(float result[4][4], float a[4][4], float b[4][4])
    {
  7. pendingchaos revised this gist Oct 25, 2015. No changes.
  8. pendingchaos created this gist Oct 25, 2015.
    33 changes: 33 additions & 0 deletions gistfile1.txt
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,33 @@
    #include <xmmintrin.h>

    /*
     * Computes one output row of a 4x4 product: the linear combination
     *   lhs_row[0]*b0 + lhs_row[1]*b1 + lhs_row[2]*b2 + lhs_row[3]*b3
     * where b0..b3 are the four rows of the right-hand matrix.
     * The additions are performed left to right.
     */
    static inline __m128 mul_row_by_matrix(const float *lhs_row,
                                           __m128 b0, __m128 b1,
                                           __m128 b2, __m128 b3)
    {
        __m128 acc = _mm_mul_ps(b0, _mm_set1_ps(lhs_row[0]));
        acc = _mm_add_ps(acc, _mm_mul_ps(b1, _mm_set1_ps(lhs_row[1])));
        acc = _mm_add_ps(acc, _mm_mul_ps(b2, _mm_set1_ps(lhs_row[2])));
        acc = _mm_add_ps(acc, _mm_mul_ps(b3, _mm_set1_ps(lhs_row[3])));
        return acc;
    }

    /*
     * 4x4 single-precision matrix multiply using SSE: result = a * b,
     * i.e. result[i][j] = sum over k of a[i][k] * b[k][j].
     *
     * Author's note: ~2.4x faster than non-SSE unrolled version.
     *
     * result  receives the product.
     * a       left-hand matrix.
     * b       right-hand matrix.
     *
     * None of the three arrays needs to be 16-byte aligned: both the loads
     * and the stores use the unaligned intrinsics. Every input element is
     * read before the first store, so result may be the same array as a or b.
     */
    void mul(float result[4][4], float a[4][4], float b[4][4])
    {
        /* Rows of the right-hand matrix; each is reused for all four outputs. */
        const __m128 b0 = _mm_loadu_ps(b[0]);
        const __m128 b1 = _mm_loadu_ps(b[1]);
        const __m128 b2 = _mm_loadu_ps(b[2]);
        const __m128 b3 = _mm_loadu_ps(b[3]);

        /* Finish all arithmetic before writing anything, so in-place use works. */
        const __m128 r0 = mul_row_by_matrix(a[0], b0, b1, b2, b3);
        const __m128 r1 = mul_row_by_matrix(a[1], b0, b1, b2, b3);
        const __m128 r2 = mul_row_by_matrix(a[2], b0, b1, b2, b3);
        const __m128 r3 = mul_row_by_matrix(a[3], b0, b1, b2, b3);

        /* Unaligned stores: _mm_store_ps would fault on a non-16-byte-aligned row. */
        _mm_storeu_ps(result[0], r0);
        _mm_storeu_ps(result[1], r1);
        _mm_storeu_ps(result[2], r2);
        _mm_storeu_ps(result[3], r3);
    }