tannergooding · September 2, 2022 09:14 · Jul 28, 2020 · Jul 28, 2020 · Jul 28, 2020
diff --git a/gistfile1.txt → Perf.md b/gistfile1.txt → Perf.md
diff --git a/MathF.Sin.cs b/MathF.Sin.cs
@@ -59,128 +59,144 @@
 
 public static float Sin(float x)
 {
-    return SoftwareFallback(x);
+    double result = x;
 
-    static float SoftwareFallback(float x)
+    if (float.IsFinite(x))
     {
-        double result = x;
+        double ax = Math.Abs(x);
 
-        if (float.IsFinite(x))
+        if (ax <= PiOverFour)
         {
-            double ax = Math.Abs(x);
-
-            if (ax <= PiOverFour)
+            if (ax >= TwoPowNegSeven)
             {
-                if (ax >= TwoPowNegSeven)
-                {
-                    result = SinTaylorSeriesFourIterations(x);
-                }
-                else if (ax >= TwoPowNegThirteen)
-                {
-                    result = SinTaylorSeriesOneIteration(x);
-                }
-                else
-                {
-                    result = x;
-                }
+                result = SinTaylorSeriesFourIterations(x);
+            }
+            else if (ax >= TwoPowNegThirteen)
+            {
+                result = SinTaylorSeriesOneIteration(x);
             }
             else
             {
-                if (float.IsNegative(x))
-                {
-                    x = -x;
-                }
+                result = x;
+            }
+        }
+        else
+        {
+            int wasNegative = 0;
 
-                if (x < 16000000.0)
-                {
-                    // Reduce x to be in the range of -(PI / 4) to (PI / 4), inclusive
+            if (float.IsNegative(x))
+            {
+                x = -x;
+                wasNegative = 1;
+            }
 
-                    // This is done by subtracting multiples of (PI / 2). Double-precision
-                    // isn't quite accurate enough and introduces some error, but we account
-                    // for that using a tail value that helps account for this.
+            int region;
 
-                    long axExp = BitConverter.DoubleToInt64Bits(ax) >> 52;
+            if (x < 16000000.0)
+            {
+                // Reduce x to be in the range of -(PI / 4) to (PI / 4), inclusive
 
-                    int region = (int)(x * TwoOverPi + 0.5);
-                    double piOverTwoCount = region;
+                // This is done by subtracting multiples of (PI / 2). Double-precision
+                // isn't quite accurate enough and introduces some error, but we account
+                // for that using a tail value that helps account for this.
 
-                    double rHead = x - (piOverTwoCount * PiOverTwoPartOne);
-                    double rTail = (piOverTwoCount * PiOverTwoPartOneTail);
+                long axExp = BitConverter.DoubleToInt64Bits(ax) >> 52;
 
-                    double r = rHead - rTail;
-                    long rExp = (BitConverter.DoubleToInt64Bits(r) << 1) >> 53;
+                region = (int)(x * TwoOverPi + 0.5);
+                double piOverTwoCount = region;
 
-                    if ((axExp - rExp) > 15)
-                    {
-                        // The remainder is pretty small compared with x, which implies that x is
-                        // near a multiple of (PI / 2). That is, x matches the multiple to at least
-                        // 15 bits and so we perform an additional fixup to account for any error
+                double rHead = x - (piOverTwoCount * PiOverTwoPartOne);
+                double rTail = (piOverTwoCount * PiOverTwoPartOneTail);
+
+                double r = rHead - rTail;
+                long rExp = (BitConverter.DoubleToInt64Bits(r) << 1) >> 53;
 
-                        r = rHead;
+                if ((axExp - rExp) > 15)
+                {
+                    // The remainder is pretty small compared with x, which implies that x is
+                    // near a multiple of (PI / 2). That is, x matches the multiple to at least
+                    // 15 bits and so we perform an additional fixup to account for any error
 
-                        rTail = (piOverTwoCount * PiOverTwoPartTwo);
-                        rHead = r - rTail;
-                        rTail = (piOverTwoCount * PiOverTwoPartTwoTail) - ((r - rHead) - rTail);
+                    r = rHead;
 
-                        r = rHead - rTail;
-                    }
+                    rTail = (piOverTwoCount * PiOverTwoPartTwo);
+                    rHead = r - rTail;
+                    rTail = (piOverTwoCount * PiOverTwoPartTwoTail) - ((r - rHead) - rTail);
+
+                    r = rHead - rTail;
+                }
 
-                    if (rExp >= 0x3F2)      // r >= 2^-13
+                if (rExp >= 0x3F2)      // r >= 2^-13
+                {
+                    if ((region & 1) == 0)  // region 0 or 2
                     {
-                        if ((region & 1) == 0)  // region 0 or 2
-                        {
-                            result = SinTaylorSeriesFourIterations(r);
-                        }
-                        else                    // region 1 or 3
-                        {
-                            result = CosTaylorSeriesFourIterations(r);
-                        }
+                        result = SinTaylorSeriesFourIterations(r);
                     }
-                    else if (rExp > 0x3DE)  // r > 1.1641532182693481E-10
+                    else                    // region 1 or 3
+                    {
+                        result = CosTaylorSeriesFourIterations(r);
+                    }
+                }
+                else if (rExp > 0x3DE)  // r > 1.1641532182693481E-10
+                {
+                    if ((region & 1) == 0)  // region 0 or 2
                     {
-                        if ((region & 1) == 0)  // region 0 or 2
-                        {
-                            result = SinTaylorSeriesOneIteration(r);
-                        }
-                        else                    // region 1 or 3
-                        {
-                            result = CosTaylorSeriesOneIteration(r);
-
-                        }
+                        result = SinTaylorSeriesOneIteration(r);
                     }
-                    else
+                    else                    // region 1 or 3
                     {
-                        if ((region & 1) == 0)  // region 0 or 2
-                        {
-                            result = r;
-                        }
-                        else                    // region 1 or 3
-                        {
-                            result = 1;
-                        }
+                        result = CosTaylorSeriesOneIteration(r);
+
                     }
                 }
                 else
                 {
-                    double r = ReduceForLargeInput(x, out int region);
-
                     if ((region & 1) == 0)  // region 0 or 2
                     {
-                        result = SinTaylorSeriesFourIterations(r);
+                        result = r;
                     }
                     else                    // region 1 or 3
                     {
-                        result = CosTaylorSeriesFourIterations(r);
+                        result = 1;
                     }
                 }
+            }
+            else
+            {
+                double r = ReduceForLargeInput(x, out region);
+
+                if ((region & 1) == 0)  // region 0 or 2
+                {
+                    result = SinTaylorSeriesFourIterations(r);
+                }
+                else                    // region 1 or 3
+                {
+                    result = CosTaylorSeriesFourIterations(r);
+                }
+            }
 
-                // Lsinf_sse2_adjust_region
+            region >>= 1;
+
+            int tmp1 = region & wasNegative;
+
+            region = ~region;
+            wasNegative = ~wasNegative;
+
+            int tmp2 = region & wasNegative;
+
+            if (((tmp1 | tmp2) & 1) == 0)
+            {
+                // If the original region was 0/1 and arg is negative, then we negate the result.
+                // -or-
+                // If the original region was 2/3 and arg is positive, then we negate the result.
+
+                result = -result;
             }
         }
-
-        return (float)result;
     }
 
+    return (float)result;
+
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
     static double CosTaylorSeriesOneIteration(double x1)
     {

diff --git a/gistfile1.txt b/gistfile1.txt
@@ -0,0 +1,6 @@
+|            Test Env | Method |     Mean |    Error |   StdDev |   Median |      Min |      Max | Gen 0 | Gen 1 | Gen 2 | Allocated |
+|-------------------- |------- |---------:|---------:|---------:|---------:|---------:|---------:|------:|------:|------:|----------:|
+|  x64 Windows Native |    Sin | 22.82 us | 0.015 us | 0.014 us | 22.82 us | 22.80 us | 22.85 us |     - |     - |     - |         - |
+|  x86 Windows Native |    Sin | 60.48 us | 0.087 us | 0.068 us | 60.48 us | 60.38 us | 60.60 us |     - |     - |     - |         - |
+| x64 Windows Managed |    Sin | 21.26 us | 0.305 us | 0.285 us | 21.27 us | 20.85 us | 21.65 us |     - |     - |     - |         - |
+| x86 Windows Managed |    Sin | 38.10 us | 0.390 us | 0.346 us | 38.06 us | 37.66 us | 38.71 us |     - |     - |     - |         - |
diff --git a/MathF.Sin.cs b/MathF.Sin.cs
@@ -0,0 +1,358 @@
+// This is a port of the amd/win-libm implementation provided in assembly here: https://github.com/amd/win-libm/blob/master/sinf.asm
+// The original source is Copyright (c) 2002-2019 Advanced Micro Devices, Inc. and provided under the MIT License.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this Software and associated documentaon files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+// 
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+const double PiOverTwo = 1.5707963267948966;
+const double PiOverTwoPartOne = 1.5707963267341256;
+const double PiOverTwoPartOneTail = 6.077100506506192E-11;
+const double PiOverTwoPartTwo = 6.077100506303966E-11;
+const double PiOverTwoPartTwoTail = 2.0222662487959506E-21;
+const double PiOverFour = 0.7853981633974483;
+const double TwoOverPi = 0.6366197723675814;
+const double TwoPowNegSeven = 0.0078125;
+const double TwoPowNegThirteen = 0.0001220703125;
+
+const double C0 = -1.0 / 2.0;       // 1 / 2!
+const double C1 = +1.0 / 24.0;      // 1 / 4!
+const double C2 = -1.0 / 720.0;     // 1 / 6!
+const double C3 = +1.0 / 40320.0;   // 1 / 8!
+const double C4 = -1.0 / 3628800.0; // 1 / 10!
+
+const double S1 = -1.0 / 6.0;       // 1 / 3!
+const double S2 = +1.0 / 120.0;     // 1 / 5!
+const double S3 = -1.0 / 5040.0;    // 1 / 7!
+const double S4 = +1.0 / 362880.0;  // 1 / 9!
+
+private static readonly long[] PiBits = new long[]
+{
+    0,
+    5215,
+    13000023176,
+    11362338026,
+    67174558139,
+    34819822259,
+    10612056195,
+    67816420731,
+    57840157550,
+    19558516809,
+    50025467026,
+    25186875954,
+    18152700886
+};
+
+public static float Sin(float x)
+{
+    return SoftwareFallback(x);
+
+    static float SoftwareFallback(float x)
+    {
+        double result = x;
+
+        if (float.IsFinite(x))
+        {
+            double ax = Math.Abs(x);
+
+            if (ax <= PiOverFour)
+            {
+                if (ax >= TwoPowNegSeven)
+                {
+                    result = SinTaylorSeriesFourIterations(x);
+                }
+                else if (ax >= TwoPowNegThirteen)
+                {
+                    result = SinTaylorSeriesOneIteration(x);
+                }
+                else
+                {
+                    result = x;
+                }
+            }
+            else
+            {
+                if (float.IsNegative(x))
+                {
+                    x = -x;
+                }
+
+                if (x < 16000000.0)
+                {
+                    // Reduce x to be in the range of -(PI / 4) to (PI / 4), inclusive
+
+                    // This is done by subtracting multiples of (PI / 2). Double-precision
+                    // isn't quite accurate enough and introduces some error, but we account
+                    // for that using a tail value that helps account for this.
+
+                    long axExp = BitConverter.DoubleToInt64Bits(ax) >> 52;
+
+                    int region = (int)(x * TwoOverPi + 0.5);
+                    double piOverTwoCount = region;
+
+                    double rHead = x - (piOverTwoCount * PiOverTwoPartOne);
+                    double rTail = (piOverTwoCount * PiOverTwoPartOneTail);
+
+                    double r = rHead - rTail;
+                    long rExp = (BitConverter.DoubleToInt64Bits(r) << 1) >> 53;
+
+                    if ((axExp - rExp) > 15)
+                    {
+                        // The remainder is pretty small compared with x, which implies that x is
+                        // near a multiple of (PI / 2). That is, x matches the multiple to at least
+                        // 15 bits and so we perform an additional fixup to account for any error
+
+                        r = rHead;
+
+                        rTail = (piOverTwoCount * PiOverTwoPartTwo);
+                        rHead = r - rTail;
+                        rTail = (piOverTwoCount * PiOverTwoPartTwoTail) - ((r - rHead) - rTail);
+
+                        r = rHead - rTail;
+                    }
+
+                    if (rExp >= 0x3F2)      // r >= 2^-13
+                    {
+                        if ((region & 1) == 0)  // region 0 or 2
+                        {
+                            result = SinTaylorSeriesFourIterations(r);
+                        }
+                        else                    // region 1 or 3
+                        {
+                            result = CosTaylorSeriesFourIterations(r);
+                        }
+                    }
+                    else if (rExp > 0x3DE)  // r > 1.1641532182693481E-10
+                    {
+                        if ((region & 1) == 0)  // region 0 or 2
+                        {
+                            result = SinTaylorSeriesOneIteration(r);
+                        }
+                        else                    // region 1 or 3
+                        {
+                            result = CosTaylorSeriesOneIteration(r);
+
+                        }
+                    }
+                    else
+                    {
+                        if ((region & 1) == 0)  // region 0 or 2
+                        {
+                            result = r;
+                        }
+                        else                    // region 1 or 3
+                        {
+                            result = 1;
+                        }
+                    }
+                }
+                else
+                {
+                    double r = ReduceForLargeInput(x, out int region);
+
+                    if ((region & 1) == 0)  // region 0 or 2
+                    {
+                        result = SinTaylorSeriesFourIterations(r);
+                    }
+                    else                    // region 1 or 3
+                    {
+                        result = CosTaylorSeriesFourIterations(r);
+                    }
+                }
+
+                // Lsinf_sse2_adjust_region
+            }
+        }
+
+        return (float)result;
+    }
+
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    static double CosTaylorSeriesOneIteration(double x1)
+    {
+        // 1 - (x^2 / 2!)
+        double x2 = x1 * x1;
+        return 1.0 + (x2 * C1);
+    }
+
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    static double CosTaylorSeriesFourIterations(double x1)
+    {
+        // 1 - (x^2 / 2!) + (x^4 / 4!) - (x^6 / 6!) + (x^8 / 8!) - (x^10 / 10!)
+
+        double x2 = x1 * x1;
+        double x4 = x2 * x2;
+
+        return 1.0 + (x2 * C0) + (x4 * ((C1 + (x2 * C2)) + (x4 * (C3 + (x2 * C4)))));
+    }
+
+    static unsafe double ReduceForLargeInput(double x, out int region)
+    {
+        Debug.Assert(!double.IsNegative(x));
+
+        // This method simulates multi-precision floating-point
+        // arithmetic and is accurate for all 1 <= x < infinity
+
+        const int BitsPerIteration = 36;
+        long ux = BitConverter.DoubleToInt64Bits(x);
+
+        int xExp = (int)(((ux & 0x7FF0000000000000) >> 52) - 1023);
+        ux = ((ux & 0x000FFFFFFFFFFFFF) | 0x0010000000000000) >> 29;
+
+        // Now ux is the mantissa bit pattern of x as a long integer
+        long mask = 1;
+        mask = (mask << BitsPerIteration) - 1;
+
+        // Set first and last to the positions of the first and last chunks of (2 / PI) that we need
+        int first = xExp / BitsPerIteration;
+        int resultExp = xExp - (first * BitsPerIteration);
+
+        // 120 is the theoretical maximum number of bits (actually
+        // 115 for IEEE single precision) that we need to extract
+        // from the middle of (2 / PI) to compute the reduced argument
+        // accurately enough for our purposes
+
+        int last = first + (120 / BitsPerIteration);
+
+        // Unroll the loop. This is only correct because we know that bitsper is fixed as 36.
+
+        long* result = stackalloc long[10];
+        long u, carry;
+
+        result[4] = 0;
+        u = PiBits[last] * ux;
+
+        result[3] = u & mask;
+        carry = u >> BitsPerIteration;
+        u = PiBits[last - 1] * ux + carry;
+
+        result[2] = u & mask;
+        carry = u >> BitsPerIteration;
+        u = PiBits[last - 2] * ux + carry;
+
+        result[1] = u & mask;
+        carry = u >> BitsPerIteration;
+        u = PiBits[first] * ux + carry;
+
+        result[0] = u & mask;
+
+        // Reconstruct the result
+        int ltb = (int)((((result[0] << BitsPerIteration) | result[1]) >> (BitsPerIteration - 1 - resultExp)) & 7);
+
+        long mantissa;
+        long nextBits;
+
+        // determ says whether the fractional part is >= 0.5
+        bool determ = (ltb & 1) != 0;
+
+        int i = 1;
+
+        if (determ)
+        {
+            // The mantissa is >= 0.5. We want to subtract it from 1.0 by negating all the bits
+            region = ((ltb >> 1) + 1) & 3;
+
+            mantissa = 1;
+            mantissa = ~(result[1]) & ((mantissa << (BitsPerIteration - resultExp)) - 1);
+
+            while (mantissa < 0x0000000000010000)
+            {
+                i++;
+                mantissa = (mantissa << BitsPerIteration) | (~(result[i]) & mask);
+            }
+
+            nextBits = (~(result[i + 1]) & mask);
+        }
+        else
+        {
+            region = (ltb >> 1);
+
+            mantissa = 1;
+            mantissa = result[1] & ((mantissa << (BitsPerIteration - resultExp)) - 1);
+
+            while (mantissa < 0x0000000000010000)
+            {
+                i++;
+                mantissa = (mantissa << BitsPerIteration) | result[i];
+            }
+
+            nextBits = result[i + 1];
+        }
+
+        // Normalize the mantissa.
+        // The shift value 6 here, determined by trial and error, seems to give optimal speed.
+
+        int bc = 0;
+
+        while (mantissa < 0x0000400000000000)
+        {
+            bc += 6;
+            mantissa <<= 6;
+        }
+
+        while (mantissa < 0x0010000000000000)
+        {
+            bc++;
+            mantissa <<= 1;
+        }
+
+        mantissa |= nextBits >> (BitsPerIteration - bc);
+
+        int rExp = 52 + resultExp - bc - i * BitsPerIteration;
+
+        // Put the result exponent rexp onto the mantissa pattern
+        u = (rExp + 1023L) << 52;
+        ux = (mantissa & 0x000FFFFFFFFFFFFF) | u;
+
+        if (determ)
+        {
+            // If we negated the mantissa we negate x too
+            ux |= unchecked((long)(0x8000000000000000));
+        }
+
+        x = BitConverter.Int64BitsToDouble(ux);
+
+        // x is a double precision version of the fractional part of
+        // (x * (2 / PI)). Multiply x by (PI / 2) in double precision
+        // to get the reduced result.
+
+        return x * PiOverTwo;
+    }
+
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    static double SinTaylorSeriesOneIteration(double x1)
+    {
+        // x - (x^3 / 3!)
+
+        double x2 = x1 * x1;
+        double x3 = x2 * x1;
+
+        return x1 + (x3 * S1);
+    }
+
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    static double SinTaylorSeriesFourIterations(double x1)
+    {
+        // x - (x^3 / 3!) + (x^5 / 5!) - (x^7 / 7!) + (x^9 / 9!)
+
+        double x2 = x1 * x1;
+        double x3 = x2 * x1;
+        double x4 = x2 * x2;
+
+        return x1 + ((S1 + (x2 * S2) + (x4 * (S3 + (x2 * S4)))) * x3);
+    }
+}
No results found