Skip to content

Instantly share code, notes, and snippets.

@VamuveTV
Forked from ShaderManager/SSE Sort
Created July 22, 2020 12:14
Show Gist options
  • Save VamuveTV/d5d0d3ee9eb8c3db6b82eba0cb7f36fc to your computer and use it in GitHub Desktop.
Save VamuveTV/d5d0d3ee9eb8c3db6b82eba0cb7f36fc to your computer and use it in GitHub Desktop.

Revisions

  1. @ShaderManager ShaderManager revised this gist Apr 20, 2015. 1 changed file with 6 additions and 15 deletions.
    21 changes: 6 additions & 15 deletions SSE Sort
    Original file line number Diff line number Diff line change
    @@ -1,19 +1,8 @@
    // Bitwise select on float vectors: every result bit is taken from `a` where the
    // matching bit of `mask` is set and from `b` where it is clear.
    static inline __m128 vselect(const __m128 a, const __m128 b, const __m128 mask)
    {
        const __m128 kept_from_a = _mm_and_ps(a, mask);
        const __m128 kept_from_b = _mm_andnot_ps(mask, b);   // (~mask) & b
        return _mm_or_ps(kept_from_b, kept_from_a);
    }

    // Bitwise select on integer vectors: every result bit is taken from `a` where the
    // matching bit of `mask` is set and from `b` where it is clear.
    static inline __m128i vselect(const __m128i a, const __m128i b, const __m128i mask)
    {
        const __m128i kept_from_a = _mm_and_si128(a, mask);
        const __m128i kept_from_b = _mm_andnot_si128(mask, b);   // (~mask) & b
        return _mm_or_si128(kept_from_b, kept_from_a);
    }

    /*
    Sort 4 floats in SSE vector using sorting network and return indices of moved values
    */
    inline __m128i v4_sort(__m128& v)
    {
    // @todo Replace by constants?
    auto i = _mm_castsi128_ps(_mm_set_epi32(3, 2, 1, 0));
    const auto mask = _mm_castsi128_ps(_mm_set1_epi32(0xFFFFFFFC));

    @@ -26,24 +15,26 @@ inline __m128i v4_sort(__m128& v)
    auto cmp = _mm_cmplt_ps(v, temp);

    cmp = _mm_shuffle_ps(cmp, cmp, _MM_SHUFFLE(1, 0, 1, 0));
    auto temp2 = vselect(v, temp, cmp);
    auto temp2 = _mm_blendv_ps(temp, v, cmp);

    // Second pass
    temp = _mm_shuffle_ps(temp2, temp2, _MM_SHUFFLE(2, 3, 0, 1));
    cmp = _mm_cmplt_ps(temp2, temp);

    cmp = _mm_shuffle_ps(cmp, cmp, _MM_SHUFFLE(2, 0, 2, 0));

    temp2 = vselect(temp2, temp, cmp);
    temp2 = _mm_blendv_ps(temp, temp2, cmp);

    // Third pass
    temp = _mm_shuffle_ps(temp2, temp2, _MM_SHUFFLE(3, 1, 2, 0));
    cmp = _mm_cmplt_ps(temp2, temp);

    cmp = _mm_shuffle_ps(cmp, cmp, _MM_SHUFFLE(3, 1, 1, 0));

    v = vselect(temp2, temp, cmp);
    v = _mm_blendv_ps(temp, temp2, cmp);

    // Remove indices from input and return them
    return _mm_castps_si128(_mm_andnot_ps(mask, v));
    auto ret = _mm_castps_si128(_mm_andnot_ps(mask, v));

    return ret;
    }
  2. @ShaderManager ShaderManager revised this gist Apr 20, 2015. 1 changed file with 2 additions and 4 deletions.
    6 changes: 2 additions & 4 deletions SSE Sort
    Original file line number Diff line number Diff line change
    @@ -13,6 +13,7 @@ Sort 4 floats in SSE vector using sorting network and return indices of moved va
    */
    inline __m128i v4_sort(__m128& v)
    {
    // @todo Replace by constants?
    auto i = _mm_castsi128_ps(_mm_set_epi32(3, 2, 1, 0));
    const auto mask = _mm_castsi128_ps(_mm_set1_epi32(0xFFFFFFFC));

    @@ -44,8 +45,5 @@ inline __m128i v4_sort(__m128& v)
    v = vselect(temp2, temp, cmp);

    // Remove indices from input and return them
    auto ret = _mm_castps_si128(_mm_andnot_ps(mask, v));
    v = _mm_and_ps(mask, v);

    return ret;
    return _mm_castps_si128(_mm_andnot_ps(mask, v));
    }
  3. @ShaderManager ShaderManager revised this gist Apr 20, 2015. 1 changed file with 20 additions and 15 deletions.
    35 changes: 20 additions & 15 deletions SSE Sort
    Original file line number Diff line number Diff line change
    @@ -9,38 +9,43 @@ static inline __m128i vselect(const __m128i a, const __m128i b, const __m128i ma
    }

    /*
    Sort 4 floats in SSE vector using sorting network and return indices of moved values
    Sort 4 floats in SSE vector using sorting network and return indices of moved values
    */
    inline __m128i v4_sort(__m128& v)
    {
    auto i = _mm_set_epi32(3, 2, 1, 0);

    auto i = _mm_castsi128_ps(_mm_set_epi32(3, 2, 1, 0));
    const auto mask = _mm_castsi128_ps(_mm_set1_epi32(0xFFFFFFFC));

    // Place indices in lower 2 bits of mantissa
    v = _mm_or_ps(_mm_and_ps(v, mask), i);

    // Simple sorting network for n=4
    // First pass
    auto temp = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 0, 3, 2));
    auto cmp = _mm_cmplt_ps(v, temp);

    cmp = _mm_shuffle_ps(cmp, cmp, _MM_SHUFFLE(1, 0, 1, 0));
    auto temp2 = vselect(v, temp, cmp);
    // Swap indices
    i = vselect(i, _mm_shuffle_epi32(i, _MM_SHUFFLE(1, 0, 3, 2)), _mm_castps_si128(cmp));


    // Second pass
    temp = _mm_shuffle_ps(temp2, temp2, _MM_SHUFFLE(2, 3, 0, 1));
    cmp = _mm_cmplt_ps(temp2, temp);

    cmp = _mm_shuffle_ps(cmp, cmp, _MM_SHUFFLE(2, 0, 2, 0));

    temp2 = vselect(temp2, temp, cmp);
    i = vselect(i, _mm_shuffle_epi32(i, _MM_SHUFFLE(2, 3, 0, 1)), _mm_castps_si128(cmp));


    // Third pass
    temp = _mm_shuffle_ps(temp2, temp2, _MM_SHUFFLE(3, 1, 2, 0));
    cmp = _mm_cmplt_ps(temp2, temp);

    cmp = _mm_shuffle_ps(cmp, cmp, _MM_SHUFFLE(3, 1, 1, 0));

    v = vselect(temp2, temp, cmp);

    return vselect(i, _mm_shuffle_epi32(i, _MM_SHUFFLE(3, 1, 2, 0)), _mm_castps_si128(cmp));

    // Remove indices from input and return them
    auto ret = _mm_castps_si128(_mm_andnot_ps(mask, v));
    v = _mm_and_ps(mask, v);

    return ret;
    }
  4. @ShaderManager ShaderManager created this gist Apr 20, 2015.
    46 changes: 46 additions & 0 deletions SSE Sort
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,46 @@
    // Bitwise select on float vectors: every result bit is taken from `a` where the
    // matching bit of `mask` is set and from `b` where it is clear.
    static inline __m128 vselect(const __m128 a, const __m128 b, const __m128 mask)
    {
        const __m128 kept_from_a = _mm_and_ps(a, mask);
        const __m128 kept_from_b = _mm_andnot_ps(mask, b);   // (~mask) & b
        return _mm_or_ps(kept_from_b, kept_from_a);
    }

    // Bitwise select on integer vectors: every result bit is taken from `a` where the
    // matching bit of `mask` is set and from `b` where it is clear.
    static inline __m128i vselect(const __m128i a, const __m128i b, const __m128i mask)
    {
        const __m128i kept_from_a = _mm_and_si128(a, mask);
        const __m128i kept_from_b = _mm_andnot_si128(mask, b);   // (~mask) & b
        return _mm_or_si128(kept_from_b, kept_from_a);
    }

    /*
    Sort the 4 floats of an SSE vector in ascending order with a 5-comparator
    sorting network: (0,2) (1,3), then (0,1) (2,3), then (1,2).

    v       in/out. On return lane 0 holds the smallest value and lane 3 the largest.
    return  for every output lane, the index (0..3) of the input lane whose value
            ended up there.

    Each compare-exchange keeps a pair in place only when lower < upper is true,
    so a pair of equal values is exchanged as well (the sort is not stable).
    */
    inline __m128i v4_sort(__m128& v)
    {
        // Lane k starts out holding index k; every exchange applied to the values
        // below is mirrored on this vector.
        auto i = _mm_set_epi32(3, 2, 1, 0);

        // First pass: compare-exchange lanes (0,2) and (1,3)
        auto temp = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 0, 3, 2));       // partner of every lane
        auto cmp = _mm_cmplt_ps(v, temp);
        // Both lanes of a pair must follow the decision of the pair's lower lane:
        // [c0, c1, c0, c1]
        cmp = _mm_shuffle_ps(cmp, cmp, _MM_SHUFFLE(1, 0, 1, 0));
        auto temp2 = vselect(v, temp, cmp);
        // Swap indices
        i = vselect(i, _mm_shuffle_epi32(i, _MM_SHUFFLE(1, 0, 3, 2)), _mm_castps_si128(cmp));

        // Second pass: compare-exchange lanes (0,1) and (2,3)
        temp = _mm_shuffle_ps(temp2, temp2, _MM_SHUFFLE(2, 3, 0, 1));
        cmp = _mm_cmplt_ps(temp2, temp);
        // Broadcast the lower lane's decision inside each pair: [c0, c0, c2, c2].
        // The previous _MM_SHUFFLE(2, 0, 2, 0) produced [c0, c2, c0, c2], which made
        // lane 1 follow the (2,3) decision and lane 2 follow the (0,1) decision, so
        // e.g. {1, 2, 4, 3} came out as {1, 1, 4, 4}.
        cmp = _mm_shuffle_ps(cmp, cmp, _MM_SHUFFLE(2, 2, 0, 0));
        temp2 = vselect(temp2, temp, cmp);
        i = vselect(i, _mm_shuffle_epi32(i, _MM_SHUFFLE(2, 3, 0, 1)), _mm_castps_si128(cmp));

        // Third pass: compare-exchange lanes (1,2); lanes 0 and 3 are paired with themselves
        temp = _mm_shuffle_ps(temp2, temp2, _MM_SHUFFLE(3, 1, 2, 0));
        cmp = _mm_cmplt_ps(temp2, temp);
        // [c0, c1, c1, c3]: lanes 1 and 2 share the decision of lane 1; lanes 0 and 3
        // keep their self-comparison (always false), which selects the identical value
        // from temp.
        cmp = _mm_shuffle_ps(cmp, cmp, _MM_SHUFFLE(3, 1, 1, 0));
        v = vselect(temp2, temp, cmp);

        return vselect(i, _mm_shuffle_epi32(i, _MM_SHUFFLE(3, 1, 2, 0)), _mm_castps_si128(cmp));
    }