Skip to content

Instantly share code, notes, and snippets.

@nxpatterns
Forked from jamesdiacono/masm_to_nasm.sh
Created April 14, 2023 07:33
Show Gist options
  • Save nxpatterns/ee456ed32ccb2c03164949b65b3c1b34 to your computer and use it in GitHub Desktop.
Save nxpatterns/ee456ed32ccb2c03164949b65b3c1b34 to your computer and use it in GitHub Desktop.

Revisions

  1. @jamesdiacono jamesdiacono revised this gist Mar 5, 2023. 2 changed files with 56 additions and 0 deletions.
    File renamed without changes.
    56 changes: 56 additions & 0 deletions test_darwin_silicon.sh
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,56 @@
    #!/bin/bash

    # This script builds and runs the DEC64 tests for a Macintosh with an Apple
    # Silicon processor. It takes no arguments.

    # DEC64's ARM source must be tweaked slightly to make it compatible with clang:

    # 0. Comments start with a double slash, not a semicolon.
    # 1. The global directive, like all directives, is preceeded by a period.
    # 2. The area directive is not supported. We do, however, align.
    # 3. Labels must be suffixed with a colon.
    # 4. An 8 byte constant is declared with .quad rather than dcq.
    # 5. When shifting an address offset, an extra comma is required.
    # 6. The end directive is unnecessary.
    # 7. The names of entrypoints require a leading underscore to link correctly.

    sed \
    -E \
    -e "s_;_//_g" \
    -e "s/global ([a-z0-9_]+) \\[func\\]/.global \1/g" \
    -e "s/area dec64, align=8, code, readonly/.align 8/g" \
    -e "s/^([a-z0-9_]+)/\1:/g" \
    -e "s/dcq/.quad/g" \
    -e "s/x([0-9]) lsl 3/x\1, lsl 3/g" \
    -e "/ end/d" \
    -e "s/dec64_/_dec64_/g" \
    <dec64.s \
    >dec64.silicon.s \
    || exit

    build_and_run() {

    # Compile, assemble and link a source file into a binary executable, then
    # run it.

    src=$1
    bin=$2

    # The tests shift negative integers on purpose, so we suppress that warning.

    clang \
    -Wno-shift-negative-value \
    -o $bin \
    dec64.silicon.s \
    dec64_string.c \
    dec64_math.c \
    $src \
    && ./$bin
    }

    build_and_run dec64_test.c dec64_test \
    && build_and_run dec64_string_test.c dec64_string_test \
    && build_and_run dec64_math_test.c dec64_math_test \
    || exit

    # This script has been tested on a 2021 MacBook Pro with an M1 Pro processor.
  2. @jamesdiacono jamesdiacono revised this gist Sep 5, 2022. 1 changed file with 5 additions and 6 deletions.
    11 changes: 5 additions & 6 deletions test_android.sh
    Original file line number Diff line number Diff line change
    @@ -8,12 +8,11 @@

    # 0. Comments start with a double slash, not a semicolon.
    # 1. The global directive, like all directives, is preceeded by a period.
    # 2. The global directive, like all directives, is preceeded by a period.
    # 3. The area directive is not supported. We translate the "align".
    # 4. Labels are suffixed with a colon.
    # 5. An 8 byte constant is declared with .quad rather than dcq.
    # 6. When shifting an address offset, a comma is required before the lsl.
    # 7. The end directive is unnecessary.
    # 2. The area directive is not supported. We do, however, align.
    # 3. Labels must be suffixed with a colon.
    # 4. An 8 byte constant is declared with .quad rather than dcq.
    # 5. When shifting an address offset, an extra comma is required.
    # 6. The end directive is unnecessary.

    sed \
    -E \
  3. @jamesdiacono jamesdiacono revised this gist Sep 5, 2022. 2 changed files with 27 additions and 1259 deletions.
    1,257 changes: 0 additions & 1,257 deletions dec64.s
    Original file line number Diff line number Diff line change
    @@ -1,1257 +0,0 @@
    // dec64.s
    // 2022-09-05
    // Public Domain

    // No warranty expressed or implied. Use at your own risk. You have been warned.


    // This file contains the source assembly code for the ARM64 implementation of
    // the DEC64 library, being the elementary arithmetic operations for DEC64, a
    // decimal floating point type.

    // DEC64 uses 64 bits to represent a number. The low order 8 bits contain an
    // exponent that ranges from -127 to 127. The exponent is not biased. The
    // exponent -128 is reserved for nan (not a number). The remaining 56 bits,
    // including the sign bit, are the coefficient in the range
    // -36 028 797 018 963 968 thru 36 028 797 018 963 967. The exponent and the
    // coefficient are both twos complement signed numbers.
    //
    // The value of any non-nan DEC64 number is coefficient * (10 ** exponent).
    //
    // Rounding is to the nearest value. Ties are rounded away from zero. Integer
    // division is floored. The result of modulo has the sign of the divisor.
    // There is no negative zero.
    //
    // All 72057594037927936 values with an exponent of -128 are nan values.

    // When these functions return nan, they will always return DEC64_NULL,
    // the normal nan value.

    // These operations will produce a result of DEC64_NULL:
    //
    // dec64_abs(nan)
    // dec64_ceiling(nan)
    // dec64_floor(nan)
    // dec64_neg(nan)
    // dec64_normal(nan)
    // dec64_signum(nan)
    //
    // These operations will produce a result of zero for all values of n,
    // even if n is nan:
    //
    // dec64_divide(0, n)
    // dec64_integer_divide(0, n)
    // dec64_modulo(0, n)
    // dec64_multiply(0, n)
    // dec64_multiply(n, 0)
    //
    // These operations will produce a result of DEC64_NULL for all values of n
    // except zero:
    //
    // dec64_divide(n, 0)
    // dec64_divide(n, nan)
    // dec64_integer_divide(n, 0)
    // dec64_integer_divide(n, nan)
    // dec64_modulo(n, 0)
    // dec64_modulo(n, nan)
    // dec64_multiply(n, nan)
    // dec64_multiply(nan, n)
    //
    // These operations will produce a result of normal nan for all values of n:
    //
    // dec64_add(n, nan)
    // dec64_add(nan, n)
    // dec64_divide(nan, n)
    // dec64_integer_divide(nan, n)
    // dec64_modulo(nan, n)
    // dec64_round(nan, n)
    // dec64_subtract(n, nan)
    // dec64_subtract(nan, n)
    //
    // You know what goes great with those defective pentium chips?
    // Defective pentium salsa! (David Letterman)

    // All public symbols have a dec64_ prefix. All other symbols are private.

    .global dec64_abs
    // (number: dec64) returns absolution: dec64

    .global dec64_add
    // (augend: dec64, addend: dec64) returns sum: dec64

    .global dec64_ceiling
    // (number: dec64) returns integer: dec64

    .global dec64_coefficient
    // (number: dec64) returns coefficient: int64

    .global dec64_divide
    // (dividend: dec64, divisor: dec64) returns quotient: dec64

    .global dec64_exponent
    // (number: dec64) returns exponent: int64

    .global dec64_floor
    // (number: dec64) returns integer: dec64

    .global dec64_integer_divide
    // (dividend: dec64, divisor: dec64) returns quotient: dec64

    .global dec64_is_equal
    // (comparahend: dec64, comparator: dec64) returns comparison: dec64

    .global dec64_is_false
    // (boolean: dec64) returns comparison: dec64

    .global dec64_is_integer
    // (number: dec64) returns comparison: dec64

    .global dec64_is_less
    // (comparahend: dec64, comparator: dec64) returns comparison: dec64

    .global dec64_is_nan
    // (number: dec64) returns comparison: dec64

    .global dec64_is_zero
    // (number: dec64) returns comparison: dec64

    .global dec64_modulo
    // (dividend: dec64, divisor: dec64) returns modulus: dec64

    .global dec64_multiply
    // (multiplicand: dec64, multiplier: dec64) returns product: dec64

    .global dec64_neg
    // (number: dec64) returns negation: dec64

    .global dec64_new
    // (coefficient: int64, exponent: int64) returns number: dec64

    .global dec64_normal
    // (number: dec64) returns normalization: dec64

    .global dec64_round
    // (number: dec64, place: dec64) returns quantization: dec64

    .global dec64_signum
    // (number: dec64) returns signature: dec64

    .global dec64_subtract
    // (minuend: dec64, subtrahend: dec64) returns difference: dec64


    // All of the public functions in this file accept up to two arguments,
    // which are passed in registers (x0, x1), returning a result in x0.

    // Registers x0 thru x15 may be clobbered.
    // The other registers are not disturbed.
    // The stack is not touched in any way.

    .align 8

    power: // the powers of 10
    .quad 1 // 0
    .quad 10 // 1
    .quad 100 // 2
    .quad 1000 // 3
    .quad 10000 // 4
    .quad 100000 // 5
    .quad 1000000 // 6
    .quad 10000000 // 7
    .quad 100000000 // 8
    .quad 1000000000 // 9
    .quad 10000000000 // 10
    .quad 100000000000 // 11
    .quad 1000000000000 // 12
    .quad 10000000000000 // 13
    .quad 100000000000000 // 14
    .quad 1000000000000000 // 15
    .quad 10000000000000000 // 16
    .quad 100000000000000000 // 17
    .quad 1000000000000000000 // 18
    .quad 10000000000000000000// 19

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_coefficient:
    // (number: dec64) returns coefficient: int64

    // Return the coefficient part from a dec64 number.

    asr x0, x0, 8
    ret

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_exponent:
    // (number: dec64) returns exponent: int64

    // Return the exponent part, sign extended to 64 bits, from a dec64 number.
    // dec64_exponent(nan) returns -128.

    sxtb x0, w0
    ret

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_new:
    // (coefficient: int64, exponent: int64) returns number: dec64

    // The dec64_new function will combine the coefficient and exponent into a
    // dec64.
    // Numbers that are too tiny to be contained in this format become zero.
    // Numbers that are too huge to be contained in this format become nan.
    // Clobbers x1, x4 thru x7, x11.

    // The coefficient is in x0.
    // The exponent is in x1.

    mov x11, x1 // x11 is the exponent
    mov x1, xzr // x1 is zero

    new:

    mov x7, 10

    // If the exponent is less than -127, or if abs(coefficient) >= 2**55 then we
    // must shrink the coefficient.

    asr x4, x0, 55 // x4 is the 9 msb of exponent, sign extended
    add x4, x4, x4, lsr 63 // x4 is zero if the number fits
    sub x5, x11, -127 // x5 is negative if exponent is too negative
    orr x4, x4, x5, asr 63
    cbnz x4, new_shrink

    new_almost_done:

    // If the exponent is too large, then we must grow the coefficient.

    subs xzr, x11, 127
    b.gt new_grow

    new_done:

    // Pack the exponent and coefficient together to form a new dec64.

    cbz x0, return
    and x4, x11, 0xFF
    add x0, x4, x0, lsl 8
    ret

    new_grow:

    // The coefficient is good, but the exponent is too big.
    // We will try to grow the coefficient by mutliplying by ten.

    mul x0, x0, x7
    sub x11, x11, 1

    // Is the coefficient still in range?

    asr x4, x0, 55 // x4 is the 9 msb of exponent, sign extended
    add x4, x4, x4, lsr 63 // x4 is zero if the number fits

    // If so, we are almost done.

    cbz x4, new_almost_done

    // The number is too big to represent as a DEC64. So sad.

    b return_null

    new_shrink:

    // Divide the coefficient by 10 (remembering its old value in x6)
    // and increment the exponent.

    mov x6, x0
    sdiv x0, x0, x7
    add x11, x11, 1

    // Are the coefficient and exponent now in range? If not, keep shrinking.

    asr x4, x0, 55 // x4 is the 9 msb of exponent, sign extended
    add x4, x4, x4, lsr 63 // x4 is zero if the number fits
    sub x5, x11, -127 // x5 is negative if exponent is too negative
    orr x4, x4, x5, asr 63
    cbnz x4, new_shrink

    // Is the absolute value of the remainder greater than or equal to 5?

    msub x5, x0, x7, x6 // x5 is old coefficient - coefficient * 10
    ands xzr, x5, x5 // is the remainder negative?
    cneg x5, x5, mi // x5 is abs(remainder)
    subs xzr, x5, 5 // is the remainder 5 or more?
    b.ge new_round // rounding is required
    subs xzr, x11, 127 // is the exponent still in range?
    b.le new_done
    b return_null

    new_round:

    // If so, round the coefficient away from zero.

    asr x6, x6, 63 // x6 is the sign extension
    orr x6, x6, 1 // x6 is -1 or 1
    add x0, x0, x6 // x0 is the rounded coefficient
    b new

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_round:
    // (number: dec64, place: dec64) returns quantization: dec64

    // The place argument indicates at what decimal place to round.
    // -2 nearest cent
    // 0 nearest integer
    // 3 nearest thousand
    // 6 nearest million
    // 9 nearest billion

    // The place should be between -16 and 16.

    asr x4, x0, 8 // x4 is coefficient of number
    sxtb x11, w0 // x11 is exponent of number
    asr x6, x1, 8 // x6 is coefficient places
    sxtb x5, w1 // x5 is exponent of places
    subs xzr, x11, -128
    b.eq return_null
    cbz x4, return_zero // is number zero?

    // If places is null, use zero.

    subs xzr, x5, -128
    csel x5, xzr, x5, eq
    csel x6, xzr, x6, eq
    cbnz x5, round_normal // if places is not an integer, normalize

    subs xzr, x11, x6 // are we done?
    b.ge return
    mov x10, 10

    round_loop:

    // Increment the exponent and divide the coefficient by 10 until the target
    // exponent is reached.

    cbz x4, return_zero
    mov x5, x4 // x5 is old coefficient
    sdiv x4, x4, x10 // x4 is coefficient / 10
    add x11, x11, 1

    subs xzr, x11, x6
    b.lt round_loop

    // If abs(remainder) is 5 or more, bump the coefficient.

    msub x5, x4, x10, x5 // x5 is the remainder
    ands xzr, x4, x4 // is the number negative?
    cneg x5, x5, mi // x5 is abs(remainder)
    asr x8, x4, 63 // x8 is -1 or 0
    orr x8, x8, 1 // x8 is -1 or 1
    subs xzr, x5, 5
    csel x8, x8, xzr, ge // x8 is zero if no rounding needed
    add x0, x4, x8
    b new

    round_normal:

    // If places is not obviously an integer, then attempt to normalize it.

    mov x14, x0 // x14 is the number
    mov x15, x30 // x15 is return address
    mov x0, x1
    adr x30, 8
    b dec64_normal
    mov x30, x15
    ands xzr, x0, 0xFF
    b.ne return_null
    mov x1, x0
    mov x0, x14
    b dec64_round

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_add:
    // (augend: dec64, addend: dec64) returns sum: dec64

    // Add two dec64 numbers together.

    // If the two exponents are both zero (which is usually the case for integers)
    // we can take the fastest path. Since the exponents are both zero, we can
    // simply add the numbers together and check for overflow.
    // Clobbers x4 thru x11

    orr x5, x0, x1
    ands xzr, x5, 255
    b.ne add_begin
    adds x0, x0, x1
    b.vs add_overflow
    ret

    add_overflow:

    // The fast add overflowed. This is very uncommon. The exponent is in the bottom
    // of x1. x0 contains the scaled coefficient, missing its most significant bit.

    sub x0, x0, x1 // undo the addition
    sxtb x11, w1 // x11 is the exponent
    asr x0, x0, 8
    add x0, x0, x1, asr 8
    b new

    add_begin:

    // If the exponents are equal, then we can add fast

    sxtb x5, w0 // x5 is the first exponent
    subs xzr, x5, -128 // Make sure the augend is not nan.
    b.eq return_null
    sxtb x7, w1 // x7 is the second exponent
    bfxil x0, xzr, 0, 8 // clear the x0 exponent
    subs xzr, x5, x7 // are the exponents equal?
    b.eq add_fast

    // The exponents must be made equal before we can add.

    mov x10, 10 // x10 is 10
    asr x4, x0, 8 // x4 is the first coefficient
    asr x6, x1, 8 // x6 is the second coefficient

    add_slow:

    // Make sure that the number with the larger exponent is in (x4, x5).
    // The other goes in (x6, x7).

    subs xzr, x7, -128 // Make sure the addend is not nan.
    b.eq return_null
    subs xzr, x5, x7
    b.ge add_grow
    mov x8, x4
    mov x9, x5
    mov x4, x6
    mov x5, x7
    mov x6, x8
    mov x7, x9

    add_grow:

    // If the exponents are not equal, try growing x4.
    // First make sure there is some headroom.

    subs xzr, x5, x7
    b.eq add_ready
    asr x11, x4, 58
    eor x11, x11, x4, asr 63
    cbnz x11, add_shrink
    mul x4, x4, x10
    sub x5, x5, 1
    b add_grow

    add_shrink:

    // If the exponents are not equal yet, try shrinking x6.

    sdiv x6, x6, x10
    add x7, x7, 1
    subs xzr, x5, x7
    b.ne add_shrink

    add_ready:

    // The exponents are equal. We are ready to add.

    add x0, x4, x6
    mov x11, x5
    b new

    add_fast:

    adds x0, x0, x1
    b.vs add_overflow
    asr x10, x0, 8
    cbz x10, return_zero
    ret

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_subtract:
    // (minuend: dec64, subtrahend: dec64) returns difference: dec64

    // Subtract two dec64 by negating the subtrahend and adding. The complication
    // is that the coeffient -36028797018963968 is not like the other coefficients.

    // Negate the subtrahend coefficient without changing the exponent.

    eor x1, x1, 0xFFFFFFFFFFFFFF00
    adds x1, x1, 256 // 2s complement adds 1 after a 'not'
    b.vc dec64_add // if it did not overflow, we can add

    // Negating the subtrahend caused an overflow.
    // Set things up to jump into the add slow path.

    asr x4, x0, 8 // x4 is the first coefficient
    sxtb x5, w0 // x5 is the first exponent
    mov x6, 0x80000000000000
    sxtb x7, w1 // x7 is the second exponent
    subs xzr, x5, -128 // make sure the minuend is not nan
    b.ne add_slow
    b return_null

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_ceiling:
    // (number: dec64) returns integer: dec64

    // Produce the smallest integer that is greater than or equal to the number.
    // In the result, the exponent will be greater than or equal to zero unless
    // it is nan. Numbers with positive exponents will not be modified, even if
    // the numbers are outside of the safe integer range.

    mov x7, 1 // x7 is the incrementor (round up)
    b floor_begin

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_floor:
    // (number: dec64) returns integer: dec64

    // Produce the largest integer that is less than or equal to the number. This
    // is sometimes called the entier function. In the result, the exponent will
    // be greater than or equal to zero unless it is nan. Numbers with positive
    // exponents will not be modified, even if the numbers are outside of the safe
    // integer range.
    // Clobbers x4 thru x10

    mov x7, -1 // x7 is the incrementor (round down)

    floor_begin:

    asr x4, x0, 8 // x4 is the coefficient
    tbnz x0, 7, floor_fractional
    cbz x4, return_zero
    ret

    floor_fractional:

    sxtb x5, w0 // x5 is the exponent
    sub x5, xzr, x5 // x5 is abs(exponent)
    subs xzr, x5, 16 // is the number super dinky?
    b.gt floor_dinky
    adr x10, power
    ldr x10, [x10, x5, lsl 3] // get a power of 10
    sdiv x0, x4, x10 // x0 is coefficient / power of 10
    msub x8, x0, x10, x4 // x8 is the remainder

    // Three things determine if x0 needs to be incremented, decremented,
    // or left alone:
    // Is this 'floor' or 'ceiling' (x7)
    // Is the remainder zero (x8)
    // Is the coefficient negative (x4)

    floor_inc:

    eor x9, x4, x7 // if number & incrementor are not same sign
    bic x7, x7, x9, asr 63 // then incrementor is zero
    subs xzr, x8, xzr // if the remainder is zero
    csel x7, xzr, x7, eq // then incrementor is zero
    add x0, x0, x7
    lsl x0, x0, 8
    ret

    floor_dinky:

    subs xzr, x5, 0x80
    b.ge return_null
    mov x8, x4
    mov x0, xzr
    b floor_inc

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_multiply:
    // (multiplicand: dec64, multiplier: dec64) returns product: dec64

    // Multiply two dec64 numbers together.
    // Clobber: x4 thru x11

    asr x4, x0, 8 // x4 is the first coefficient
    sxtb x5, w0 // x5 is the first exponent
    asr x6, x1, 8 // x6 is the second coefficient
    sxtb x7, w1 // x7 is the second exponent

    subs xzr, x5, -128
    csetm x8, ne // x8 is 0 if first exponent is null
    subs xzr, x7, -128
    csel x8, x8, xzr, ne // x8 is 0 if either exponent is null
    cbz x8, multiply_null

    ands xzr, x4, x4
    csetm x8, ne // x8 is 0 if first coefficient is 0
    ands xzr, x6, x6
    csel x8, x8, xzr, ne // x8 is 0 if either coefficient is 0
    cbz x8, return_zero

    eor x0, x4, x6 // x0 is the sign of the result
    ands xzr, x4, x4
    cneg x4, x4, mi // x4 is abs(first coefficient)
    ands xzr, x6, x6
    cneg x6, x6, mi // x6 is abs(second coefficient)
    mul x8, x4, x6 // x8 is the low part of the product
    umulh x9, x4, x6 // x9 is the high part of the product
    add x11, x5, x7 // x11 is the new exponent

    multiply_size:

    cbnz x9, multiply_reduce // is the product too big?
    tbz x8, 63, multiply_sign // does the product fit?

    // The product coefficient contains one too many bits. Divide it by 10 and
    // increment the exponent.

    mov x10, 10
    udiv x8, x8, x10
    add x11, x11, 1

    multiply_sign:

    ands xzr, x0, x0
    cneg x0, x8, mi // x0 is signed product coefficient
    b new

    multiply_reduce:

    // The product coefficent is in two words (x9, x8). We need to get it down to
    // one word. Count the number of leading zero bits to get an estimate of the
    // number of excess digits. Then divide.

    clz x5, x9 // x5 is the number of leading zeros
    mov x4, 69 // x4 is 69 (64 + fudge)
    mov x7, 77 // x7 is 77
    sub x4, x4, x5 // x4 is number of excess bits
    mul x4, x4, x7 // x4 is x5 * 77
    lsr x4, x4, 8 // x4 is number of excess digits
    adr x10, power
    ldr x6, [x10, x4, lsl 3]// x6 is a power of ten
    add x11, x11, x4 // pump up the exponent
    clz x7, x6 // count the leading 0 in high dividend
    mov x4, 64
    sub x7, x4, x7 // x7 is sigbits in power of ten

    // x0 is the sign of the product
    // x6 is the power of ten
    // x7 is the sigbits in the power of ten
    // x9, x8 is the oversized product
    // x11 is the exponent

    b divide_big

    multiply_null:

    subs xzr, x0, 0x80
    csel x4, x0, x4, eq
    subs xzr, x1, 0x80
    csel x6, x1, x6, eq
    cbz x4, return_zero
    cbz x6, return_zero
    b return_null

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_divide:
    // (dividend: dec64, divisor: dec64) returns quotient: dec64

    // Divide a dec64 number by another.
    // Clobbers x4 thru x11.

    asr x4, x0, 8 // x4 is the dividend coefficient
    cbz x4, divide_zero // is the dividend 0?
    sxtb x5, w0 // x5 is the dividend exponent
    asr x6, x1, 8 // x6 is the divisor coefficient
    sxtb x7, w1 // x7 is the divisor exponent

    // If the divisor is zero, or if either number is nan, the result is null.

    subs xzr, x5, -128
    csetm x9, ne // x9 is 0 if dividend exponent is nan
    subs xzr, x7, -128
    csel x9, x9, xzr, ne // x9 is 0 if divisor exponent is nan
    ands xzr, x6, x6
    csel x9, x9, xzr, ne // x9 is 0 if divisor coefficient is 0
    cbz x9, return_null

    // There are a couple of interesting special cases.

    subs xzr, x1, 0x200 // is the divisor 2?
    b.eq divide_two
    subs xzr, x1, 0x100 // is the divisor 1?
    b.eq return

    // The exponent of the quotient is the difference of the input exponents.

    sub x11, x5, x7 // x11 is the quotient exponent

    // Save the sign of the quotient. We will mostly be using unsigned arithmetic.

    eor x0, x4, x6 // x0 is the sign of the quotient

    // Make the arguments positive.

    ands xzr, x4, x4
    cneg x4, x4, mi // x4 is abs(dividend coefficient)
    ands xzr, x6, x6
    cneg x6, x6, mi // x6 is abs(divisor coefficient)

    // This is a floating point divide, so we want to preserve as much information
    // in the quotient as possible. To do this, we will scale up the dividend by a
    // suitable power of ten, reducing the exponent by a comparable amount.

    clz x5, x4 // x5 is leading zeros of dividend
    mov x9, 64 // x9 is 64
    clz x7, x6 // x7 is leading zeros of divisor
    sub x5, x9, x5 // x5 is sigbits in dividend
    sub x7, x9, x7 // x7 is sigbits in divisor
    add x8, x7, 59 // x8 is sigbits needed in dividend
    sub x8, x8, x5 // x8 is additional sigbits required

    // To convert bits to digits, we multiply by log10/log2 (0.30103), which is
    // almost equal to 77/256 (0.30078).

    mov x10, 77
    mul x8, x8, x10
    lsr x8, x8, 8 // x8 is the number of digits to inflate
    sub x11, x11, x8 // subtract the new digits from the exponent

    // The number of new digits could be as great as 34, but the ARM64 multiplier
    // can only take multipliers as great as 19 digits. So it might be necessary to
    // split the multiplication into two parts.

    // The optional first part produces a product that fits in one word.

    adr x9, power // x9 is the address of the power of 10 table
    subs x10, x8, 19 // is the number of digits more than 19?
    b.le divide_inflate // if not, then a single multiply is needed
    ldr x10, [x9, x10, lsl 3] // x10 is the first power of ten
    mul x4, x4, x10 // multiply the dividend by the first part
    mov x8, 19 // the next multiply will be by 1e19

    divide_inflate:

    // Put the inflated dividend in (x9, x8).

    ldr x10, [x9, x8, lsl 3]// load the power of ten
    mul x8, x4, x10 // x8 is the low half of the dividend
    umulh x9, x4, x10 // x9 is the high half of the dividend

    // Align the dividend and divisor by their leading 1 bits.
    // How we do this depends on the size of the dividend.

    cbnz x9, divide_big

    // If the dividend has only 1 word, then shift the divisor.

    clz x5, x8 // count the leading 0 in low dividend
    mov x4, 64 // x4 is 64
    mov x9, x8 // move the low part to the high part
    sub x5, x4, x5 // x5 is sigbits in dividend
    mov x8, xzr // zero out the low part
    sub x10, x5, x7 // x10 is the countdown
    lsl x6, x6, x10 // align the divisor
    b divide_ready

    divide_big:

    // If the dividend has two words, then shift the dividend.

    clz x5, x9 // count the leading 0 in high dividend
    mov x4, 64 // x4 is 64 (word size)
    sub x5, x4, x5 // x5 is sigbits in high dividend
    sub x10, x7, x5 // x10 is left shift distance
    sub x4, x4, x10 // x4 is right shift distance
    lsl x9, x9, x10 // shift high dividend
    lsr x4, x8, x4 // x4 is the carry
    orr x9, x9, x4 // insert the carry into the high dividend
    lsl x8, x8, x10 // shift the low dividend
    add x10, x5, 64 // x10 is sigbits in whole dividend (x9, x8)
    sub x10, x10, x7 // x10 is the countdown

    divide_ready:

    mov x7, xzr // x7 is the quotient

    divide_step:

    // In each divide step:
    // Double the quotient
    // Find the difference between the aligned dividend and divisor
    // If the difference is not negative
    // Add 1 to the quotient
    // Subtract the divisor from the dividend
    // Double the dividend (x9, x8)
    // Decrement the countdown

    subs x4, x9, x6 // x4 is high dividend - divisor
    cset x5, pl // x5 is 1 if difference is positive
    csel x9, x4, x9, pl // x9 is the difference if positive
    add x7, x5, x7, lsl 1 // double quotient and + 1 if positive diff
    lsr x5, x8, 63 // x5 is carry (high bit of low dividend)
    orr x9, x5, x9, lsl 1 // shift high dividend and insert carry
    lsl x8, x8, 1 // shift low dividend
    subs x10, x10, 1 // decrement countdown
    b.pl divide_step // is it done?

    // Correct the sign and get out.

    ands xzr, x0, x0 // should it negative?
    cneg x0, x7, mi // correct the sign
    b new

    divide_zero:

    // If x0 is not 0x80, return zero.

    subs xzr, x0, 0x80
    b.ne return_zero
    b return_null

    divide_two:

    // Divide a dec64 number by two. If it is even, we can do a shift. If it is odd,
    // then we decrement the exponent and multiply the coefficient by 5.

    tbnz x4, 0, divide_two_odd // the coefficient is odd
    cbz x4, return_zero
    lsl x0, x4, 7
    bfi x0, x5, 0, 8
    ret

    divide_two_odd:

    // Multiply by 5 and divide by 10.

    sub x11, x5, 1 // decrease the exponent
    add x0, x4, x4, lsl 2 // x0 is coefficient * 5
    b new

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_integer_divide:
    // (dividend: dec64, divisor: dec64) returns quotient: dec64

    // Divide, with a floored integer result. It produces the same result as
    // dec64_floor(dec64_divide(dividend, divisor))

    // Clobbers x12. It can also clobber more via dec64_divide and dec64_floor.

    // If either exponent is not zero, or if either coefficient is negative, then do
    // it the hard way.

    orr x12, x0, x1
    ands xzr, x12, 255
    orr x12, x12, x12, asr 63
    cbnz x12, integer_divide_hard

    cbz x0, return
    asr x12, x1, 8 // x12 is the divisor
    cbz x12, return_null // divide by zero

    sdiv x0, x0, x12
    lsl x0, x0, 8
    ret

    integer_divide_hard:

    mov x12, x30 // save the return address in x12
    bl dec64_divide
    mov x30, x12 // restore the return address
    b dec64_floor // tail call

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_modulo:
    // (dividend: dec64, divisor: dec64) returns modulus: dec64

    // Modulo. It produces the same result as
    // dec64_subtract(
    // dividend,
    // dec64_multiply(
    // dec64_floor(
    // dec64_divide(
    // dividend,
    // divisor
    // )
    // ),
    // divisor
    // )
    // )

    asr x4, x0, 8 // x4 is dividend coefficient
    cbz x4, modulo_zero
    asr x6, x1, 8 // x6 is divisor coefficient
    cbz x6, return_null
    mov x14, x0 // x14 is dividend
    mov x15, x30 // x15 is return address
    adr x30, 8
    b dec64_integer_divide
    adr x30, 8
    b dec64_multiply
    mov x1, x0
    mov x0, x14
    mov x30, x15
    b dec64_subtract

    modulo_zero:

    subs xzr, x0, 0x80
    b.ne return_zero
    b return_null

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_signum:
    // (number: dec64) returns signature: dec64

    // If the number is nan, the result is nan.
    // If the number is less than zero, the result is -1.
    // If the number is zero, the result is 0.
    // If the number is greater than zero, the result is 1.

    and x5, x0, 0xFF // x5 is the exponent
    subs xzr, x5, 0x80 // is it nan?
    b.eq return_null
    adds xzr, xzr, x0, asr 8 // is the coefficient zero?
    cset x4, ne // x4 is either 1 or 0
    asr x0, x0, 63 // x0 is either -1 or 0
    orr x0, x0, x4 // x0 is either -1, 0, or 1
    lsl x0, x0, 8
    ret

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_abs:
    // (number: dec64) returns absolution: dec64

    // Find the absolute value of a number. If the number is negative, hand it off
    // to dec64_neg. Otherwise, return the number unless it is nan or zero.

    tbnz x0, 63, dec64_neg
    and x5, x0, 0xFF // x5 is the exponent
    subs xzr, x5, 0x80 // is it nan?
    b.eq return_null
    adds x4, xzr, x0, asr 8 // is the coefficient zero?
    csel x0, x0, x4, ne // x0 is either the number or zero
    ret

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_neg:
    // (number: dec64) returns negation: dec64

    // Negate a number.

    sxtb x11, w0 // x11 is the exponent
    subs xzr, x11, -128
    b.eq return_null
    sub x0, xzr, x0, asr 8
    b new

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_normal:
    // (number: dec64) returns normalization: dec64

    // Make the exponent as close to zero as possible without losing any
    // signficance. Usually normalization is not needed since it does not
    // materially change the value of a number.

    // Clobbers: x4 thru x7

    sxtb x5, w0 // x5 is the exponent
    tbnz x0, 7, normal_micro // is the exponent negative?
    cbz x5, return // is the number an integer?
    asr x0, x0, 8 // x0 is the coefficient
    cbz x0, return_zero // is the coefficient zero?
    mov x7, 10

    normal_grow:

    // The exponent is greater than zero. If we subtract 1 from the exponent, we
    // must multiply the coefficient by 10.

    mul x4, x0, x7
    asr x6, x4, 55
    adds xzr, x6, x6, asr 63
    b.ne normal_done
    mov x0, x4
    subs x5, x5, 1
    b.gt normal_grow
    lsl x0, x0, 8
    ret

    normal_micro:

    // The exponent is negative. Is it nan?

    subs xzr, x5, -128
    b.eq return_null
    mov x7, 10
    asr x0, x0, 8 // x0 is the coefficient

    normal_shrink:

    // If we add 1 to the exponent, we must divide the coefficient by 10.

    cbz x0, return_zero // is the coefficient zero?
    sdiv x4, x0, x7
    mul x6, x4, x7
    subs xzr, x6, x0
    b.ne normal_done
    mov x0, x4
    adds x5, x5, 1
    b.mi normal_shrink

    normal_done:

    and x5, x5, 0xFF
    add x0, x5, x0, lsl 8
    ret

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    return:

    // Return whatever is in x0.

    ret

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    return_null:

    mov x0, 0x8000000000000000
    add x0, x0, 0x80
    ret

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    return_one:

    mov x0, 256 // one
    ret

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    return_zero:

    mov x0, xzr // zero
    ret

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_is_equal:
    // (comparahend: dec64, comparator: dec64) returns comparison: dec64

    // Compare two dec64 numbers.

    subs xzr, x0, x1 // this is the trivial case
    b.eq return_true

    // If the comparator is nan, then we pass the problem to dec64_is_nan.

    and x7, x1, 0xFF // x7 is the second exponent
    subs xzr, x7, 0x80
    b.eq dec64_is_nan

    // If the exponents are equal, then the numbers are not.
    // If the signs are not equal, then the numbers are not.

    eor x4, x0, x1
    asr x4, x4, 63 // x4 is 0 if the signs match
    and x5, x0, 0xFF // x5 is the first exponent
    subs xzr, x5, x7 // do the exponents match?
    csinv x5, x4, xzr, ne
    cbnz x5, return_false

    // Do it the hard way.

    mov x15, x30
    adr x30, is_equal_subtract_return
    b dec64_subtract

    is_equal_subtract_return:

    mov x30, x15
    cbz x0, return_true
    b return_false

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_is_false:
    // (boolean: dec64) returns notation: dec64

    // If the boolean is false, the result is true. Otherwise the result is false.

    mov x4, 0x8000000000000000 // the most significant bit
    add x4, x4, 0x280 // x4 is false
    add x5, x4, 0x100 // x5 is true
    subs xzr, x0, x4
    csel x0, x4, x5, ne
    ret

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_is_integer:
    // (number: dec64) returns comparison: dec64

    // If the number contains a non-zero fractional part or if it is nan, return
    // false. Otherwise, return true.

    tbz x0, 7, return_true
    sxtb x5, w0 // x5 is the exponent
    sub x5, xzr, x5
    subs xzr, x5, 17
    b.gt return_false
    adr x7, power
    ldr x7, [x7, x5, lsl 3]
    asr x0, x0, 8
    sdiv x4, x0, x7
    mul x4, x4, x7
    subs xzr, x0, x4
    b.eq return_true
    b return_false

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_is_less:
    // (comparahend: dec64, comparator: dec64) returns comparison: dec64

    // Compare two dec64 numbers. If the first is less than the second, return true,
    // otherwise return false.
    // Clobbers: x4 thru x11

    // The other 3 comparison functions are easily implemented with dec64_is_less:

    // dec64_is_greater(a, b) => dec64_is_less(b, a)
    // dec64_is_greater_or_equal(a, b) => dec64_is_false(dec64_is_less(a, b))
    // dec64_is_less_or_equal(a, b) => dec64_is_false(dec64_is_less(b, a))

    // If the exponents are the same, then do a simple compare. nan is not less than
    // any number. If the exponents are both nan, then the result must be false
    // because nan is not less than nan.

    sxtb x5, w0 // x5 is the first exponent
    sxtb x7, w1 // x7 is the second exponent
    eor x4, x0, x1 // x4 is negative if the signs mismatch
    tbnz x4, 63, less_sign
    subs x9, x5, x7 // x9 is the exponent difference
    b.ne less_hard

    subs xzr, x0, x1 // compare the numbers
    b.ge return_false

    subs xzr, x5, -128 // is the first argument nan?
    b.ne return_true
    b return_false // nan is never less than nan

    less_sign:

    subs xzr, x5, -128 // is the first argument nan?
    b.eq return_false
    subs xzr, x7, -128 // is the second argument nan?
    b.eq return_true
    tbnz x0, 63, return_true
    b return_false

    less_hard:

    subs xzr, x5, -128 // is the first argument nan?
    b.eq return_false
    subs xzr, x7, -128 // is the second argument nan?
    b.eq return_true
    asr x6, x1, 8 // x6 is the second coefficient
    cbz x6, return_false
    adds x4, xzr, x0, asr 8 // x4 is the first coefficient
    b.eq return_true
    cneg x4, x4, mi // x4 is abs(first coefficient)
    cneg x6, x6, mi // x6 is abs(second coefficient)
    mov x0, 0x8000000000000000
    add x0, x0, 0x280 // x0 is false
    lsr x8, x1, 63 // x8 is 1 if numbers are negative
    eor x0, x0, x8, lsl 8 // flip x0 if the inputs are negative

    // Swap and flip if the difference of the exponents is negative.

    ands xzr, x9, x9
    b.pl less_hard_scale
    mov x11, x4
    mov x4, x6
    mov x6, x11
    sub x9, xzr, x9
    eor x0, x0, 0x100 // flip x0

    less_hard_scale:

    // The maximum possible coefficient is 36028797018963967. 10**18 is more than
    // that. If the exponential difference is large, then the coefficients do not
    // matter.

    subs xzr, x9, 18
    b.ge return

    // Amplify the first coefficient. This will make their exponents the same,
    // allowing a simple comparison.

    adr x10, power
    ldr x10, [x10, x9, lsl 3] // x10 is a power of ten
    umulh x8, x4, x10 // x8 is the high extension of the product
    cbnz x8, return // if it overflowed, it can not be less
    mul x4, x4, x10 // x4 is the scaled coefficient
    subs xzr, x4, x6 // now we can compare
    b.eq return_false
    b.hi return
    eor x0, x0, 0x100 // flip one more time
    ret

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_is_nan:
    // (number: dec64) returns comparison: dec64

    and x5, x0, 0xFF
    subs xzr, x5, 0x80
    cset x5, eq
    mov x0, 0x8000000000000000
    add x0, x0, 0x280 // x0 is false
    add x0, x0, x5, lsl 8
    ret

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_is_zero:
    // (number: dec64) returns comparison: dec64

    ands x4, x0, 0xFFFFFFFFFFFFFF00
    cset x4, eq
    and x5, x0, 0xFF
    subs xzr, x5, 0x80
    csel x4, xzr, x4, eq
    mov x0, 0x8000000000000000
    add x0, x0, 0x280 // x0 is false
    add x0, x0, x4, lsl 8
    ret

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    return_false:

    mov x0, 0x8000000000000000
    add x0, x0, 0x280 // x0 is false
    ret

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    return_true:

    mov x0, 0x8000000000000000
    add x0, x0, 0x380 // x0 is true
    ret
    29 changes: 27 additions & 2 deletions test_android.sh
    Original file line number Diff line number Diff line change
    @@ -1,8 +1,33 @@
    #!/bin/sh
    #!/bin/bash

    # This script builds and runs the DEC64 tests for an ARM64 Android device. It
    # takes no arguments.

    # DEC64's ARM source must be tweaked slightly to make it compatible with
    # the Android NDK's clang:

    # 0. Comments start with a double slash, not a semicolon.
    # 1. The global directive, like all directives, is preceeded by a period.
    # 2. The global directive, like all directives, is preceeded by a period.
    # 3. The area directive is not supported. We translate the "align".
    # 4. Labels are suffixed with a colon.
    # 5. An 8 byte constant is declared with .quad rather than dcq.
    # 6. When shifting an address offset, a comma is required before the lsl.
    # 7. The end directive is unnecessary.

    sed \
    -E \
    -e "s_;_//_g" \
    -e "s/global ([a-z0-9_]+) \\[func\\]/.global \1/g" \
    -e "s/area dec64, align=8, code, readonly/.align 8/g" \
    -e "s/^([a-z0-9_]+)/\1:/g" \
    -e "s/dcq/.quad/g" \
    -e "s/x([0-9]) lsl 3/x\1, lsl 3/g" \
    -e "/ end/d" \
    <dec64.s \
    >dec64.android.s \
    || exit

    build_and_run() {

    # Compile, assemble and link a source file into a binary executable, then
    @@ -18,7 +43,7 @@ build_and_run() {
    -o $bin \
    dec64_string.c \
    dec64_math.c \
    dec64.s \
    dec64.android.s \
    $src \
    && adb push ./$bin /data/local/tmp/$bin \
    && adb shell /data/local/tmp/$bin
  4. @jamesdiacono jamesdiacono revised this gist Sep 4, 2022. 1 changed file with 0 additions and 2 deletions.
    2 changes: 0 additions & 2 deletions dec64.s
    Original file line number Diff line number Diff line change
    @@ -8,8 +8,6 @@
    // This file contains the source assembly code for the ARM64 implementation of
    // the DEC64 library, being the elementary arithmetic operations for DEC64, a
    // decimal floating point type.
    //
    // This file has been tested with Visual Studio 19.

    // DEC64 uses 64 bits to represent a number. The low order 8 bits contain an
    // exponent that ranges from -127 to 127. The exponent is not biased. The
  5. @jamesdiacono jamesdiacono revised this gist Sep 4, 2022. 2 changed files with 1292 additions and 0 deletions.
    1,259 changes: 1,259 additions & 0 deletions dec64.s
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,1259 @@
    // dec64.s
    // 2022-09-05
    // Public Domain

    // No warranty expressed or implied. Use at your own risk. You have been warned.


    // This file contains the source assembly code for the ARM64 implementation of
    // the DEC64 library, being the elementary arithmetic operations for DEC64, a
    // decimal floating point type.
    //
    // This file has been tested with Visual Studio 19.

    // DEC64 uses 64 bits to represent a number. The low order 8 bits contain an
    // exponent that ranges from -127 to 127. The exponent is not biased. The
    // exponent -128 is reserved for nan (not a number). The remaining 56 bits,
    // including the sign bit, are the coefficient in the range
    // -36 028 797 018 963 968 thru 36 028 797 018 963 967. The exponent and the
    // coefficient are both twos complement signed numbers.
    //
    // The value of any non-nan DEC64 number is coefficient * (10 ** exponent).
    //
    // Rounding is to the nearest value. Ties are rounded away from zero. Integer
    // division is floored. The result of modulo has the sign of the divisor.
    // There is no negative zero.
    //
    // All 72057594037927936 values with an exponent of -128 are nan values.

    // When these functions return nan, they will always return DEC64_NULL,
    // the normal nan value.

    // These operations will produce a result of DEC64_NULL:
    //
    // dec64_abs(nan)
    // dec64_ceiling(nan)
    // dec64_floor(nan)
    // dec64_neg(nan)
    // dec64_normal(nan)
    // dec64_signum(nan)
    //
    // These operations will produce a result of zero for all values of n,
    // even if n is nan:
    //
    // dec64_divide(0, n)
    // dec64_integer_divide(0, n)
    // dec64_modulo(0, n)
    // dec64_multiply(0, n)
    // dec64_multiply(n, 0)
    //
    // These operations will produce a result of DEC64_NULL for all values of n
    // except zero:
    //
    // dec64_divide(n, 0)
    // dec64_divide(n, nan)
    // dec64_integer_divide(n, 0)
    // dec64_integer_divide(n, nan)
    // dec64_modulo(n, 0)
    // dec64_modulo(n, nan)
    // dec64_multiply(n, nan)
    // dec64_multiply(nan, n)
    //
    // These operations will produce a result of normal nan for all values of n:
    //
    // dec64_add(n, nan)
    // dec64_add(nan, n)
    // dec64_divide(nan, n)
    // dec64_integer_divide(nan, n)
    // dec64_modulo(nan, n)
    // dec64_round(nan, n)
    // dec64_subtract(n, nan)
    // dec64_subtract(nan, n)
    //
    // You know what goes great with those defective pentium chips?
    // Defective pentium salsa! (David Letterman)

    // All public symbols have a dec64_ prefix. All other symbols are private.

    .global dec64_abs
    // (number: dec64) returns absolution: dec64

    .global dec64_add
    // (augend: dec64, addend: dec64) returns sum: dec64

    .global dec64_ceiling
    // (number: dec64) returns integer: dec64

    .global dec64_coefficient
    // (number: dec64) returns coefficient: int64

    .global dec64_divide
    // (dividend: dec64, divisor: dec64) returns quotient: dec64

    .global dec64_exponent
    // (number: dec64) returns exponent: int64

    .global dec64_floor
    // (number: dec64) returns integer: dec64

    .global dec64_integer_divide
    // (dividend: dec64, divisor: dec64) returns quotient: dec64

    .global dec64_is_equal
    // (comparahend: dec64, comparator: dec64) returns comparison: dec64

    .global dec64_is_false
    // (boolean: dec64) returns comparison: dec64

    .global dec64_is_integer
    // (number: dec64) returns comparison: dec64

    .global dec64_is_less
    // (comparahend: dec64, comparator: dec64) returns comparison: dec64

    .global dec64_is_nan
    // (number: dec64) returns comparison: dec64

    .global dec64_is_zero
    // (number: dec64) returns comparison: dec64

    .global dec64_modulo
    // (dividend: dec64, divisor: dec64) returns modulus: dec64

    .global dec64_multiply
    // (multiplicand: dec64, multiplier: dec64) returns product: dec64

    .global dec64_neg
    // (number: dec64) returns negation: dec64

    .global dec64_new
    // (coefficient: int64, exponent: int64) returns number: dec64

    .global dec64_normal
    // (number: dec64) returns normalization: dec64

    .global dec64_round
    // (number: dec64, place: dec64) returns quantization: dec64

    .global dec64_signum
    // (number: dec64) returns signature: dec64

    .global dec64_subtract
    // (minuend: dec64, subtrahend: dec64) returns difference: dec64


    // All of the public functions in this file accept up to two arguments,
    // which are passed in registers (x0, x1), returning a result in x0.

    // Registers x0 thru x15 may be clobbered.
    // The other registers are not disturbed.
    // The stack is not touched in any way.

    .align 8

    power: // the powers of 10
    .quad 1 // 0
    .quad 10 // 1
    .quad 100 // 2
    .quad 1000 // 3
    .quad 10000 // 4
    .quad 100000 // 5
    .quad 1000000 // 6
    .quad 10000000 // 7
    .quad 100000000 // 8
    .quad 1000000000 // 9
    .quad 10000000000 // 10
    .quad 100000000000 // 11
    .quad 1000000000000 // 12
    .quad 10000000000000 // 13
    .quad 100000000000000 // 14
    .quad 1000000000000000 // 15
    .quad 10000000000000000 // 16
    .quad 100000000000000000 // 17
    .quad 1000000000000000000 // 18
    .quad 10000000000000000000// 19

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_coefficient:
    // (number: dec64) returns coefficient: int64

    // Return the coefficient part from a dec64 number.

    asr x0, x0, 8
    ret

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_exponent:
    // (number: dec64) returns exponent: int64

    // Return the exponent part, sign extended to 64 bits, from a dec64 number.
    // dec64_exponent(nan) returns -128.

    sxtb x0, w0
    ret

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_new:
    // (coefficient: int64, exponent: int64) returns number: dec64

    // The dec64_new function will combine the coefficient and exponent into a
    // dec64.
    // Numbers that are too tiny to be contained in this format become zero.
    // Numbers that are too huge to be contained in this format become nan.
    // Clobbers x1, x4 thru x7, x11.

    // The coefficient is in x0.
    // The exponent is in x1.

    mov x11, x1 // x11 is the exponent
    mov x1, xzr // x1 is zero

    new:

    mov x7, 10

    // If the exponent is less than -127, or if abs(coefficient) >= 2**55 then we
    // must shrink the coefficient.

    asr x4, x0, 55 // x4 is the 9 msb of exponent, sign extended
    add x4, x4, x4, lsr 63 // x4 is zero if the number fits
    sub x5, x11, -127 // x5 is negative if exponent is too negative
    orr x4, x4, x5, asr 63
    cbnz x4, new_shrink

    new_almost_done:

    // If the exponent is too large, then we must grow the coefficient.

    subs xzr, x11, 127
    b.gt new_grow

    new_done:

    // Pack the exponent and coefficient together to form a new dec64.

    cbz x0, return
    and x4, x11, 0xFF
    add x0, x4, x0, lsl 8
    ret

    new_grow:

    // The coefficient is good, but the exponent is too big.
    // We will try to grow the coefficient by mutliplying by ten.

    mul x0, x0, x7
    sub x11, x11, 1

    // Is the coefficient still in range?

    asr x4, x0, 55 // x4 is the 9 msb of exponent, sign extended
    add x4, x4, x4, lsr 63 // x4 is zero if the number fits

    // If so, we are almost done.

    cbz x4, new_almost_done

    // The number is too big to represent as a DEC64. So sad.

    b return_null

    new_shrink:

    // Divide the coefficient by 10 (remembering its old value in x6)
    // and increment the exponent.

    mov x6, x0
    sdiv x0, x0, x7
    add x11, x11, 1

    // Are the coefficient and exponent now in range? If not, keep shrinking.

    asr x4, x0, 55 // x4 is the 9 msb of exponent, sign extended
    add x4, x4, x4, lsr 63 // x4 is zero if the number fits
    sub x5, x11, -127 // x5 is negative if exponent is too negative
    orr x4, x4, x5, asr 63
    cbnz x4, new_shrink

    // Is the absolute value of the remainder greater than or equal to 5?

    msub x5, x0, x7, x6 // x5 is old coefficient - coefficient * 10
    ands xzr, x5, x5 // is the remainder negative?
    cneg x5, x5, mi // x5 is abs(remainder)
    subs xzr, x5, 5 // is the remainder 5 or more?
    b.ge new_round // rounding is required
    subs xzr, x11, 127 // is the exponent still in range?
    b.le new_done
    b return_null

    new_round:

    // If so, round the coefficient away from zero.

    asr x6, x6, 63 // x6 is the sign extension
    orr x6, x6, 1 // x6 is -1 or 1
    add x0, x0, x6 // x0 is the rounded coefficient
    b new

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_round:
    // (number: dec64, place: dec64) returns quantization: dec64

    // The place argument indicates at what decimal place to round.
    // -2 nearest cent
    // 0 nearest integer
    // 3 nearest thousand
    // 6 nearest million
    // 9 nearest billion

    // The place should be between -16 and 16.

    asr x4, x0, 8 // x4 is coefficient of number
    sxtb x11, w0 // x11 is exponent of number
    asr x6, x1, 8 // x6 is coefficient places
    sxtb x5, w1 // x5 is exponent of places
    subs xzr, x11, -128
    b.eq return_null
    cbz x4, return_zero // is number zero?

    // If places is null, use zero.

    subs xzr, x5, -128
    csel x5, xzr, x5, eq
    csel x6, xzr, x6, eq
    cbnz x5, round_normal // if places is not an integer, normalize

    subs xzr, x11, x6 // are we done?
    b.ge return
    mov x10, 10

    round_loop:

    // Increment the exponent and divide the coefficient by 10 until the target
    // exponent is reached.

    cbz x4, return_zero
    mov x5, x4 // x5 is old coefficient
    sdiv x4, x4, x10 // x4 is coefficient / 10
    add x11, x11, 1

    subs xzr, x11, x6
    b.lt round_loop

    // If abs(remainder) is 5 or more, bump the coefficient.

    msub x5, x4, x10, x5 // x5 is the remainder
    ands xzr, x4, x4 // is the number negative?
    cneg x5, x5, mi // x5 is abs(remainder)
    asr x8, x4, 63 // x8 is -1 or 0
    orr x8, x8, 1 // x8 is -1 or 1
    subs xzr, x5, 5
    csel x8, x8, xzr, ge // x8 is zero if no rounding needed
    add x0, x4, x8
    b new

    round_normal:

    // If places is not obviously an integer, then attempt to normalize it.

    mov x14, x0 // x14 is the number
    mov x15, x30 // x15 is return address
    mov x0, x1
    adr x30, 8
    b dec64_normal
    mov x30, x15
    ands xzr, x0, 0xFF
    b.ne return_null
    mov x1, x0
    mov x0, x14
    b dec64_round

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_add:
    // (augend: dec64, addend: dec64) returns sum: dec64

    // Add two dec64 numbers together.

    // If the two exponents are both zero (which is usually the case for integers)
    // we can take the fastest path. Since the exponents are both zero, we can
    // simply add the numbers together and check for overflow.
    // Clobbers x4 thru x11

    orr x5, x0, x1
    ands xzr, x5, 255
    b.ne add_begin
    adds x0, x0, x1
    b.vs add_overflow
    ret

    add_overflow:

    // The fast add overflowed. This is very uncommon. The exponent is in the bottom
    // of x1. x0 contains the scaled coefficient, missing its most significant bit.

    sub x0, x0, x1 // undo the addition
    sxtb x11, w1 // x11 is the exponent
    asr x0, x0, 8
    add x0, x0, x1, asr 8
    b new

    add_begin:

    // If the exponents are equal, then we can add fast

    sxtb x5, w0 // x5 is the first exponent
    subs xzr, x5, -128 // Make sure the augend is not nan.
    b.eq return_null
    sxtb x7, w1 // x7 is the second exponent
    bfxil x0, xzr, 0, 8 // clear the x0 exponent
    subs xzr, x5, x7 // are the exponents equal?
    b.eq add_fast

    // The exponents must be made equal before we can add.

    mov x10, 10 // x10 is 10
    asr x4, x0, 8 // x4 is the first coefficient
    asr x6, x1, 8 // x6 is the second coefficient

    add_slow:

    // Make sure that the number with the larger exponent is in (x4, x5).
    // The other goes in (x6, x7).

    subs xzr, x7, -128 // Make sure the addend is not nan.
    b.eq return_null
    subs xzr, x5, x7
    b.ge add_grow
    mov x8, x4
    mov x9, x5
    mov x4, x6
    mov x5, x7
    mov x6, x8
    mov x7, x9

    add_grow:

    // If the exponents are not equal, try growing x4.
    // First make sure there is some headroom.

    subs xzr, x5, x7
    b.eq add_ready
    asr x11, x4, 58
    eor x11, x11, x4, asr 63
    cbnz x11, add_shrink
    mul x4, x4, x10
    sub x5, x5, 1
    b add_grow

    add_shrink:

    // If the exponents are not equal yet, try shrinking x6.

    sdiv x6, x6, x10
    add x7, x7, 1
    subs xzr, x5, x7
    b.ne add_shrink

    add_ready:

    // The exponents are equal. We are ready to add.

    add x0, x4, x6
    mov x11, x5
    b new

    add_fast:

    adds x0, x0, x1
    b.vs add_overflow
    asr x10, x0, 8
    cbz x10, return_zero
    ret

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_subtract:
    // (minuend: dec64, subtrahend: dec64) returns difference: dec64

    // Subtract two dec64 by negating the subtrahend and adding. The complication
    // is that the coeffient -36028797018963968 is not like the other coefficients.

    // Negate the subtrahend coefficient without changing the exponent.

    eor x1, x1, 0xFFFFFFFFFFFFFF00
    adds x1, x1, 256 // 2s complement adds 1 after a 'not'
    b.vc dec64_add // if it did not overflow, we can add

    // Negating the subtrahend caused an overflow.
    // Set things up to jump into the add slow path.

    asr x4, x0, 8 // x4 is the first coefficient
    sxtb x5, w0 // x5 is the first exponent
    mov x6, 0x80000000000000
    sxtb x7, w1 // x7 is the second exponent
    subs xzr, x5, -128 // make sure the minuend is not nan
    b.ne add_slow
    b return_null

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_ceiling:
    // (number: dec64) returns integer: dec64

    // Produce the smallest integer that is greater than or equal to the number.
    // In the result, the exponent will be greater than or equal to zero unless
    // it is nan. Numbers with positive exponents will not be modified, even if
    // the numbers are outside of the safe integer range.

    mov x7, 1 // x7 is the incrementor (round up)
    b floor_begin

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_floor:
    // (number: dec64) returns integer: dec64

    // Produce the largest integer that is less than or equal to the number. This
    // is sometimes called the entier function. In the result, the exponent will
    // be greater than or equal to zero unless it is nan. Numbers with positive
    // exponents will not be modified, even if the numbers are outside of the safe
    // integer range.
    // Clobbers x4 thru x10

    mov x7, -1 // x7 is the incrementor (round down)

    floor_begin:

    asr x4, x0, 8 // x4 is the coefficient
    tbnz x0, 7, floor_fractional
    cbz x4, return_zero
    ret

    floor_fractional:

    sxtb x5, w0 // x5 is the exponent
    sub x5, xzr, x5 // x5 is abs(exponent)
    subs xzr, x5, 16 // is the number super dinky?
    b.gt floor_dinky
    adr x10, power
    ldr x10, [x10, x5, lsl 3] // get a power of 10
    sdiv x0, x4, x10 // x0 is coefficient / power of 10
    msub x8, x0, x10, x4 // x8 is the remainder

    // Three things determine if x0 needs to be incremented, decremented,
    // or left alone:
    // Is this 'floor' or 'ceiling' (x7)
    // Is the remainder zero (x8)
    // Is the coefficient negative (x4)

    floor_inc:

    eor x9, x4, x7 // if number & incrementor are not same sign
    bic x7, x7, x9, asr 63 // then incrementor is zero
    subs xzr, x8, xzr // if the remainder is zero
    csel x7, xzr, x7, eq // then incrementor is zero
    add x0, x0, x7
    lsl x0, x0, 8
    ret

    floor_dinky:

    subs xzr, x5, 0x80
    b.ge return_null
    mov x8, x4
    mov x0, xzr
    b floor_inc

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_multiply:
    // (multiplicand: dec64, multiplier: dec64) returns product: dec64

    // Multiply two dec64 numbers together.
    // Clobber: x4 thru x11

    asr x4, x0, 8 // x4 is the first coefficient
    sxtb x5, w0 // x5 is the first exponent
    asr x6, x1, 8 // x6 is the second coefficient
    sxtb x7, w1 // x7 is the second exponent

    subs xzr, x5, -128
    csetm x8, ne // x8 is 0 if first exponent is null
    subs xzr, x7, -128
    csel x8, x8, xzr, ne // x8 is 0 if either exponent is null
    cbz x8, multiply_null

    ands xzr, x4, x4
    csetm x8, ne // x8 is 0 if first coefficient is 0
    ands xzr, x6, x6
    csel x8, x8, xzr, ne // x8 is 0 if either coefficient is 0
    cbz x8, return_zero

    eor x0, x4, x6 // x0 is the sign of the result
    ands xzr, x4, x4
    cneg x4, x4, mi // x4 is abs(first coefficient)
    ands xzr, x6, x6
    cneg x6, x6, mi // x6 is abs(second coefficient)
    mul x8, x4, x6 // x8 is the low part of the product
    umulh x9, x4, x6 // x9 is the high part of the product
    add x11, x5, x7 // x11 is the new exponent

    multiply_size:

    cbnz x9, multiply_reduce // is the product too big?
    tbz x8, 63, multiply_sign // does the product fit?

    // The product coefficient contains one too many bits. Divide it by 10 and
    // increment the exponent.

    mov x10, 10
    udiv x8, x8, x10
    add x11, x11, 1

    multiply_sign:

    ands xzr, x0, x0
    cneg x0, x8, mi // x0 is signed product coefficient
    b new

    multiply_reduce:

    // The product coefficent is in two words (x9, x8). We need to get it down to
    // one word. Count the number of leading zero bits to get an estimate of the
    // number of excess digits. Then divide.

    clz x5, x9 // x5 is the number of leading zeros
    mov x4, 69 // x4 is 69 (64 + fudge)
    mov x7, 77 // x7 is 77
    sub x4, x4, x5 // x4 is number of excess bits
    mul x4, x4, x7 // x4 is x5 * 77
    lsr x4, x4, 8 // x4 is number of excess digits
    adr x10, power
    ldr x6, [x10, x4, lsl 3]// x6 is a power of ten
    add x11, x11, x4 // pump up the exponent
    clz x7, x6 // count the leading 0 in high dividend
    mov x4, 64
    sub x7, x4, x7 // x7 is sigbits in power of ten

    // x0 is the sign of the product
    // x6 is the power of ten
    // x7 is the sigbits in the power of ten
    // x9, x8 is the oversized product
    // x11 is the exponent

    b divide_big

    multiply_null:

    subs xzr, x0, 0x80
    csel x4, x0, x4, eq
    subs xzr, x1, 0x80
    csel x6, x1, x6, eq
    cbz x4, return_zero
    cbz x6, return_zero
    b return_null

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_divide:
    // (dividend: dec64, divisor: dec64) returns quotient: dec64

    // Divide a dec64 number by another.
    // Clobbers x4 thru x11.

    asr x4, x0, 8 // x4 is the dividend coefficient
    cbz x4, divide_zero // is the dividend 0?
    sxtb x5, w0 // x5 is the dividend exponent
    asr x6, x1, 8 // x6 is the divisor coefficient
    sxtb x7, w1 // x7 is the divisor exponent

    // If the divisor is zero, or if either number is nan, the result is null.

    subs xzr, x5, -128
    csetm x9, ne // x9 is 0 if dividend exponent is nan
    subs xzr, x7, -128
    csel x9, x9, xzr, ne // x9 is 0 if divisor exponent is nan
    ands xzr, x6, x6
    csel x9, x9, xzr, ne // x9 is 0 if divisor coefficient is 0
    cbz x9, return_null

    // There are a couple of interesting special cases.

    subs xzr, x1, 0x200 // is the divisor 2?
    b.eq divide_two
    subs xzr, x1, 0x100 // is the divisor 1?
    b.eq return

    // The exponent of the quotient is the difference of the input exponents.

    sub x11, x5, x7 // x11 is the quotient exponent

    // Save the sign of the quotient. We will mostly be using unsigned arithmetic.

    eor x0, x4, x6 // x0 is the sign of the quotient

    // Make the arguments positive.

    ands xzr, x4, x4
    cneg x4, x4, mi // x4 is abs(dividend coefficient)
    ands xzr, x6, x6
    cneg x6, x6, mi // x6 is abs(divisor coefficient)

    // This is a floating point divide, so we want to preserve as much information
    // in the quotient as possible. To do this, we will scale up the dividend by a
    // suitable power of ten, reducing the exponent by a comparable amount.

    clz x5, x4 // x5 is leading zeros of dividend
    mov x9, 64 // x9 is 64
    clz x7, x6 // x7 is leading zeros of divisor
    sub x5, x9, x5 // x5 is sigbits in dividend
    sub x7, x9, x7 // x7 is sigbits in divisor
    add x8, x7, 59 // x8 is sigbits needed in dividend
    sub x8, x8, x5 // x8 is additional sigbits required

    // To convert bits to digits, we multiply by log10/log2 (0.30103), which is
    // almost equal to 77/256 (0.30078).

    mov x10, 77
    mul x8, x8, x10
    lsr x8, x8, 8 // x8 is the number of digits to inflate
    sub x11, x11, x8 // subtract the new digits from the exponent

    // The number of new digits could be as great as 34, but the ARM64 multiplier
    // can only take multipliers as great as 19 digits. So it might be necessary to
    // split the multiplication into two parts.

    // The optional first part produces a product that fits in one word.

    adr x9, power // x9 is the address of the power of 10 table
    subs x10, x8, 19 // is the number of digits more than 19?
    b.le divide_inflate // if not, then a single multiply is needed
    ldr x10, [x9, x10, lsl 3] // x10 is the first power of ten
    mul x4, x4, x10 // multiply the dividend by the first part
    mov x8, 19 // the next multiply will be by 1e19

    divide_inflate:

    // Put the inflated dividend in (x9, x8).

    ldr x10, [x9, x8, lsl 3]// load the power of ten
    mul x8, x4, x10 // x8 is the low half of the dividend
    umulh x9, x4, x10 // x9 is the high half of the dividend

    // Align the dividend and divisor by their leading 1 bits.
    // How we do this depends on the size of the dividend.

    cbnz x9, divide_big

    // If the dividend has only 1 word, then shift the divisor.

    clz x5, x8 // count the leading 0 in low dividend
    mov x4, 64 // x4 is 64
    mov x9, x8 // move the low part to the high part
    sub x5, x4, x5 // x5 is sigbits in dividend
    mov x8, xzr // zero out the low part
    sub x10, x5, x7 // x10 is the countdown
    lsl x6, x6, x10 // align the divisor
    b divide_ready

    divide_big:

    // If the dividend has two words, then shift the dividend.

    clz x5, x9 // count the leading 0 in high dividend
    mov x4, 64 // x4 is 64 (word size)
    sub x5, x4, x5 // x5 is sigbits in high dividend
    sub x10, x7, x5 // x10 is left shift distance
    sub x4, x4, x10 // x4 is right shift distance
    lsl x9, x9, x10 // shift high dividend
    lsr x4, x8, x4 // x4 is the carry
    orr x9, x9, x4 // insert the carry into the high dividend
    lsl x8, x8, x10 // shift the low dividend
    add x10, x5, 64 // x10 is sigbits in whole dividend (x9, x8)
    sub x10, x10, x7 // x10 is the countdown

    divide_ready:

    mov x7, xzr // x7 is the quotient

    divide_step:

    // In each divide step:
    // Double the quotient
    // Find the difference between the aligned dividend and divisor
    // If the difference is not negative
    // Add 1 to the quotient
    // Subtract the divisor from the dividend
    // Double the dividend (x9, x8)
    // Decrement the countdown

    subs x4, x9, x6 // x4 is high dividend - divisor
    cset x5, pl // x5 is 1 if difference is positive
    csel x9, x4, x9, pl // x9 is the difference if positive
    add x7, x5, x7, lsl 1 // double quotient and + 1 if positive diff
    lsr x5, x8, 63 // x5 is carry (high bit of low dividend)
    orr x9, x5, x9, lsl 1 // shift high dividend and insert carry
    lsl x8, x8, 1 // shift low dividend
    subs x10, x10, 1 // decrement countdown
    b.pl divide_step // is it done?

    // Correct the sign and get out.

    ands xzr, x0, x0 // should it negative?
    cneg x0, x7, mi // correct the sign
    b new

    divide_zero:

    // If x0 is not 0x80, return zero.

    subs xzr, x0, 0x80
    b.ne return_zero
    b return_null

    divide_two:

    // Divide a dec64 number by two. If it is even, we can do a shift. If it is odd,
    // then we decrement the exponent and multiply the coefficient by 5.

    tbnz x4, 0, divide_two_odd // the coefficient is odd
    cbz x4, return_zero
    lsl x0, x4, 7
    bfi x0, x5, 0, 8
    ret

    divide_two_odd:

    // Multiply by 5 and divide by 10.

    sub x11, x5, 1 // decrease the exponent
    add x0, x4, x4, lsl 2 // x0 is coefficient * 5
    b new

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_integer_divide:
    // (dividend: dec64, divisor: dec64) returns quotient: dec64

    // Divide, with a floored integer result. It produces the same result as
    // dec64_floor(dec64_divide(dividend, divisor))

    // Clobbers x12. It can also clobber more via dec64_divide and dec64_floor.

    // If either exponent is not zero, or if either coefficient is negative, then do
    // it the hard way.

    orr x12, x0, x1
    ands xzr, x12, 255
    orr x12, x12, x12, asr 63
    cbnz x12, integer_divide_hard

    cbz x0, return
    asr x12, x1, 8 // x12 is the divisor
    cbz x12, return_null // divide by zero

    sdiv x0, x0, x12
    lsl x0, x0, 8
    ret

    integer_divide_hard:

    mov x12, x30 // save the return address in x12
    bl dec64_divide
    mov x30, x12 // restore the return address
    b dec64_floor // tail call

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_modulo:
    // (dividend: dec64, divisor: dec64) returns modulus: dec64

    // Modulo. It produces the same result as
    // dec64_subtract(
    // dividend,
    // dec64_multiply(
    // dec64_floor(
    // dec64_divide(
    // dividend,
    // divisor
    // )
    // ),
    // divisor
    // )
    // )

    asr x4, x0, 8 // x4 is dividend coefficient
    cbz x4, modulo_zero
    asr x6, x1, 8 // x6 is divisor coefficient
    cbz x6, return_null
    mov x14, x0 // x14 is dividend
    mov x15, x30 // x15 is return address
    adr x30, 8
    b dec64_integer_divide
    adr x30, 8
    b dec64_multiply
    mov x1, x0
    mov x0, x14
    mov x30, x15
    b dec64_subtract

    modulo_zero:

    subs xzr, x0, 0x80
    b.ne return_zero
    b return_null

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_signum:
    // (number: dec64) returns signature: dec64

    // If the number is nan, the result is nan.
    // If the number is less than zero, the result is -1.
    // If the number is zero, the result is 0.
    // If the number is greater than zero, the result is 1.

    and x5, x0, 0xFF // x5 is the exponent
    subs xzr, x5, 0x80 // is it nan?
    b.eq return_null
    adds xzr, xzr, x0, asr 8 // is the coefficient zero?
    cset x4, ne // x4 is either 1 or 0
    asr x0, x0, 63 // x0 is either -1 or 0
    orr x0, x0, x4 // x0 is either -1, 0, or 1
    lsl x0, x0, 8
    ret

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_abs:
    // (number: dec64) returns absolution: dec64

    // Find the absolute value of a number. If the number is negative, hand it off
    // to dec64_neg. Otherwise, return the number unless it is nan or zero.

    tbnz x0, 63, dec64_neg
    and x5, x0, 0xFF // x5 is the exponent
    subs xzr, x5, 0x80 // is it nan?
    b.eq return_null
    adds x4, xzr, x0, asr 8 // is the coefficient zero?
    csel x0, x0, x4, ne // x0 is either the number or zero
    ret

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_neg:
    // (number: dec64) returns negation: dec64

    // Negate a number.

    sxtb x11, w0 // x11 is the exponent
    subs xzr, x11, -128
    b.eq return_null
    sub x0, xzr, x0, asr 8
    b new

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_normal:
    // (number: dec64) returns normalization: dec64

    // Make the exponent as close to zero as possible without losing any
    // signficance. Usually normalization is not needed since it does not
    // materially change the value of a number.

    // Clobbers: x4 thru x7

    sxtb x5, w0 // x5 is the exponent
    tbnz x0, 7, normal_micro // is the exponent negative?
    cbz x5, return // is the number an integer?
    asr x0, x0, 8 // x0 is the coefficient
    cbz x0, return_zero // is the coefficient zero?
    mov x7, 10

    normal_grow:

    // The exponent is greater than zero. If we subtract 1 from the exponent, we
    // must multiply the coefficient by 10.

    mul x4, x0, x7
    asr x6, x4, 55
    adds xzr, x6, x6, asr 63
    b.ne normal_done
    mov x0, x4
    subs x5, x5, 1
    b.gt normal_grow
    lsl x0, x0, 8
    ret

    normal_micro:

    // The exponent is negative. Is it nan?

    subs xzr, x5, -128
    b.eq return_null
    mov x7, 10
    asr x0, x0, 8 // x0 is the coefficient

    normal_shrink:

    // If we add 1 to the exponent, we must divide the coefficient by 10.

    cbz x0, return_zero // is the coefficient zero?
    sdiv x4, x0, x7
    mul x6, x4, x7
    subs xzr, x6, x0
    b.ne normal_done
    mov x0, x4
    adds x5, x5, 1
    b.mi normal_shrink

    normal_done:

    and x5, x5, 0xFF
    add x0, x5, x0, lsl 8
    ret

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    return:

    // Return whatever is in x0.

    ret

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    return_null:

    mov x0, 0x8000000000000000
    add x0, x0, 0x80
    ret

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    return_one:

    mov x0, 256 // one
    ret

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    return_zero:

    mov x0, xzr // zero
    ret

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_is_equal:
    // (comparahend: dec64, comparator: dec64) returns comparison: dec64

    // Compare two dec64 numbers.

    subs xzr, x0, x1 // this is the trivial case
    b.eq return_true

    // If the comparator is nan, then we pass the problem to dec64_is_nan.

    and x7, x1, 0xFF // x7 is the second exponent
    subs xzr, x7, 0x80
    b.eq dec64_is_nan

    // If the exponents are equal, then the numbers are not.
    // If the signs are not equal, then the numbers are not.

    eor x4, x0, x1
    asr x4, x4, 63 // x4 is 0 if the signs match
    and x5, x0, 0xFF // x5 is the first exponent
    subs xzr, x5, x7 // do the exponents match?
    csinv x5, x4, xzr, ne
    cbnz x5, return_false

    // Do it the hard way.

    mov x15, x30
    adr x30, is_equal_subtract_return
    b dec64_subtract

    is_equal_subtract_return:

    mov x30, x15
    cbz x0, return_true
    b return_false

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_is_false:
    // (boolean: dec64) returns notation: dec64

    // If the boolean is false, the result is true. Otherwise the result is false.

    mov x4, 0x8000000000000000 // the most significant bit
    add x4, x4, 0x280 // x4 is false
    add x5, x4, 0x100 // x5 is true
    subs xzr, x0, x4
    csel x0, x4, x5, ne
    ret

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_is_integer:
    // (number: dec64) returns comparison: dec64

    // If the number contains a non-zero fractional part or if it is nan, return
    // false. Otherwise, return true.

    tbz x0, 7, return_true
    sxtb x5, w0 // x5 is the exponent
    sub x5, xzr, x5
    subs xzr, x5, 17
    b.gt return_false
    adr x7, power
    ldr x7, [x7, x5, lsl 3]
    asr x0, x0, 8
    sdiv x4, x0, x7
    mul x4, x4, x7
    subs xzr, x0, x4
    b.eq return_true
    b return_false

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_is_less:
    // (comparahend: dec64, comparator: dec64) returns comparison: dec64

    // Compare two dec64 numbers. If the first is less than the second, return true,
    // otherwise return false.
    // Clobbers: x4 thru x11

    // The other 3 comparison functions are easily implemented with dec64_is_less:

    // dec64_is_greater(a, b) => dec64_is_less(b, a)
    // dec64_is_greater_or_equal(a, b) => dec64_is_false(dec64_is_less(a, b))
    // dec64_is_less_or_equal(a, b) => dec64_is_false(dec64_is_less(b, a))

    // If the exponents are the same, then do a simple compare. nan is not less than
    // any number. If the exponents are both nan, then the result must be false
    // because nan is not less than nan.

    sxtb x5, w0 // x5 is the first exponent
    sxtb x7, w1 // x7 is the second exponent
    eor x4, x0, x1 // x4 is negative if the signs mismatch
    tbnz x4, 63, less_sign
    subs x9, x5, x7 // x9 is the exponent difference
    b.ne less_hard

    subs xzr, x0, x1 // compare the numbers
    b.ge return_false

    subs xzr, x5, -128 // is the first argument nan?
    b.ne return_true
    b return_false // nan is never less than nan

    less_sign:

    subs xzr, x5, -128 // is the first argument nan?
    b.eq return_false
    subs xzr, x7, -128 // is the second argument nan?
    b.eq return_true
    tbnz x0, 63, return_true
    b return_false

    less_hard:

    subs xzr, x5, -128 // is the first argument nan?
    b.eq return_false
    subs xzr, x7, -128 // is the second argument nan?
    b.eq return_true
    asr x6, x1, 8 // x6 is the second coefficient
    cbz x6, return_false
    adds x4, xzr, x0, asr 8 // x4 is the first coefficient
    b.eq return_true
    cneg x4, x4, mi // x4 is abs(first coefficient)
    cneg x6, x6, mi // x6 is abs(second coefficient)
    mov x0, 0x8000000000000000
    add x0, x0, 0x280 // x0 is false
    lsr x8, x1, 63 // x8 is 1 if numbers are negative
    eor x0, x0, x8, lsl 8 // flip x0 if the inputs are negative

    // Swap and flip if the difference of the exponents is negative.

    ands xzr, x9, x9
    b.pl less_hard_scale
    mov x11, x4
    mov x4, x6
    mov x6, x11
    sub x9, xzr, x9
    eor x0, x0, 0x100 // flip x0

    less_hard_scale:

    // The maximum possible coefficient is 36028797018963967. 10**18 is more than
    // that. If the exponential difference is large, then the coefficients do not
    // matter.

    subs xzr, x9, 18
    b.ge return

    // Amplify the first coefficient. This will make their exponents the same,
    // allowing a simple comparison.

    adr x10, power
    ldr x10, [x10, x9, lsl 3] // x10 is a power of ten
    umulh x8, x4, x10 // x8 is the high extension of the product
    cbnz x8, return // if it overflowed, it can not be less
    mul x4, x4, x10 // x4 is the scaled coefficient
    subs xzr, x4, x6 // now we can compare
    b.eq return_false
    b.hi return
    eor x0, x0, 0x100 // flip one more time
    ret

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_is_nan:
    // (number: dec64) returns comparison: dec64

    and x5, x0, 0xFF
    subs xzr, x5, 0x80
    cset x5, eq
    mov x0, 0x8000000000000000
    add x0, x0, 0x280 // x0 is false
    add x0, x0, x5, lsl 8
    ret

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_is_zero:
    // (number: dec64) returns comparison: dec64

    ands x4, x0, 0xFFFFFFFFFFFFFF00
    cset x4, eq
    and x5, x0, 0xFF
    subs xzr, x5, 0x80
    csel x4, xzr, x4, eq
    mov x0, 0x8000000000000000
    add x0, x0, 0x280 // x0 is false
    add x0, x0, x4, lsl 8
    ret

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    return_false:

    mov x0, 0x8000000000000000
    add x0, x0, 0x280 // x0 is false
    ret

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    return_true:

    mov x0, 0x8000000000000000
    add x0, x0, 0x380 // x0 is true
    ret
    33 changes: 33 additions & 0 deletions test_android.sh
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,33 @@
    #!/bin/sh

    # This script builds and runs the DEC64 tests for an ARM64 Android device. It
    # takes no arguments.

    build_and_run() {

    # Compile, assemble and link a source file into a binary executable, then
    # transfer and run that executable on a connected Android device.

    src=$1
    bin=$2

    # The tests shift negative integers on purpose, so we suppress that warning.

    "NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android32-clang" \
    -Wno-shift-negative-value \
    -o $bin \
    dec64_string.c \
    dec64_math.c \
    dec64.s \
    $src \
    && adb push ./$bin /data/local/tmp/$bin \
    && adb shell /data/local/tmp/$bin
    }

    build_and_run dec64_test.c dec64_test \
    && build_and_run dec64_string_test.c dec64_string_test \
    && build_and_run dec64_math_test.c dec64_math_test \
    || exit

    # This script has been tested on a Samsung A52 5G device running Android 12,
    # using the Android NDK r25b.
  6. @jamesdiacono jamesdiacono created this gist Sep 3, 2022.
    1,460 changes: 1,460 additions & 0 deletions dec64.nasm
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,1460 @@
    ; title dec64.nasm for x64.

    ; dec64.com
    ; 2022-09-03
    ; Public Domain

    ; No warranty expressed or implied. Use at your own risk. You have been warned.

    ; This file implements the elementary arithmetic operations for DEC64, a decimal
    ; floating point type. DEC64 uses 64 bits to represent a number. The low order
    ; 8 bits contain an exponent that ranges from -127 to 127. The exponent is not
    ; biased. The exponent -128 is reserved for nan (not a number). The remaining
    ; 56 bits, including the sign bit, are the coefficient in the range
    ; -36028797018963968 thru 36028797018963967. The exponent and the coefficient
    ; are both two's complement signed numbers.
    ;
    ; The value of any non-nan DEC64 number is coefficient * (10 ** exponent).
    ;
    ; Rounding is to the nearest value. Ties are rounded away from zero. Integer
    ; division is floored. The result of modulo has the sign of the divisor.
    ; There is no negative zero.
    ;
    ; These operations will produce a result of nan:
    ;
    ; dec64_abs(nan)
    ; dec64_ceiling(nan)
    ; dec64_floor(nan)
    ; dec64_neg(nan)
    ; dec64_normal(nan)
    ; dec64_signum(nan)
    ;
    ; These operations will produce a result of zero for all values of n,
    ; even if n is nan:
    ;
    ; dec64_divide(0, n)
    ; dec64_integer_divide(0, n)
    ; dec64_modulo(0, n)
    ; dec64_multiply(0, n)
    ; dec64_multiply(n, 0)
    ;
    ; These operations will produce a result of nan for all values of n
    ; except zero:
    ;
    ; dec64_divide(n, 0)
    ; dec64_divide(n, nan)
    ; dec64_integer_divide(n, 0)
    ; dec64_integer_divide(n, nan)
    ; dec64_modulo(n, 0)
    ; dec64_modulo(n, nan)
    ; dec64_multiply(n, nan)
    ; dec64_multiply(nan, n)
    ;
    ; These operations will produce a result of nan for all values of n:
    ;
    ; dec64_add(n, nan)
    ; dec64_add(nan, n)
    ; dec64_divide(nan, n)
    ; dec64_integer_divide(nan, n)
    ; dec64_modulo(nan, n)
    ; dec64_round(nan, n)
    ; dec64_subtract(n, nan)
    ; dec64_subtract(nan, n)
    ;
    ; This file can be processed with Microsoft's ML64.exe. There might be other
    ; assemblers that can process this file, but that has not been tested.
    ;
    ; You know what goes great with those defective pentium chips?
    ; Defective pentium salsa! (David Letterman)

    ; All public symbols have a dec64_ prefix. All other symbols are private.

    ; There are 72057594037927936 possible nan values. Three are reserved:
    ;
    ; DEC64_NULL
    ; DEC64_TRUE
    ; DEC64_FALSE
    ;
    ; The other 72057594037927933 nan values can be used for any purpose,
    ; including object pointers.

    ; When these functions return nan, they will always return DEC64_NULL,
    ; the normal nan.

    ; The comparison functions will return DEC64_TRUE or DEC64_FALSE.

    %define null 8000000000000080h
    %define true 8000000000000380h
    %define false 8000000000000280h

    ; Multiplication by a scaled reciprocal can be faster than division.

    %define eight_over_ten -3689348814741910323

    global dec64_abs;(number: dec64)
    ; returns absolution: dec64

    global dec64_add;(augend: dec64, addend: dec64)
    ; returns sum: dec64

    global dec64_ceiling;(number: dec64)
    ; returns integer: dec64

    global dec64_coefficient;(number: dec64)
    ; returns coefficient: int64

    global dec64_divide;(dividend: dec64, divisor: dec64)
    ; returns quotient: dec64

    global dec64_exponent;(number: dec64)
    ; returns exponent: int64

    global dec64_floor;(number: dec64)
    ; returns integer: dec64

    global dec64_integer_divide;(dividend: dec64, divisor: dec64)
    ; returns quotient: dec64

    global dec64_is_equal;(comparahend: dec64, comparator: dec64)
    ; returns comparison: dec64

    global dec64_is_false;(boolean: dec64)
    ; returns comparison: dec64

    global dec64_is_integer;(number: dec64)
    ; returns comparison: dec64

    global dec64_is_less;(comparahend: dec64, comparator: dec64)
    ; returns comparison: dec64

    global dec64_is_nan;(number: dec64)
    ; returns comparison: dec64

    global dec64_is_zero;(number: dec64)
    ; returns comparison: dec64

    global dec64_modulo;(dividend: dec64, divisor: dec64)
    ; returns modulus: dec64

    global dec64_multiply;(multiplicand: dec64, multiplier: dec64)
    ; returns product: dec64

    global dec64_neg;(number: dec64)
    ; returns negation: dec64

    global dec64_new;(coefficient: int64, exponent: int64)
    ; returns number: dec64

    global dec64_normal;(number: dec64)
    ; returns normalization: dec64

    global dec64_round;(number: dec64, place: dec64)
    ; returns quantization: dec64

    global dec64_signum;(number: dec64)
    ; returns signature: dec64

    global dec64_subtract;(minuend: dec64, subtrahend: dec64)
    ; returns difference: dec64

    ; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    ; Repair the register names. Over the long and twisted evolution of x86, the
    ; register names have picked up some weird, inconsistent conventions. We can
    ; simplify, naming them r0 thru r15, with a _b suffix to indicate the low
    ; order byte, an _h suffix to indicate the so-called high byte and a _w suffix
    ; to indicate the lower 16 bits. We would use _d to indicate the lower 32 bits,
    ; but that was not needed here.

    %define r0 rax
    %define r1 rcx
    %define r2 rdx
    %define r6 rsi
    %define r7 rdi

    %define r0_b al
    %define r1_b cl
    %define r2_b dl
    %define r8_b r8b
    %define r9_b r9b
    %define r10_b r10b
    %define r11_b r11b

    %define r0_h ah
    %define r1_h ch
    %define r2_h dh

    %define r0_w ax
    %define r1_w cx

    %define jr1z jrcxz

    ; All of the public functions in this file accept up to two arguments, which
    ; are passed in registers (either r1, r2 or r7, r6), returning a result in r0.

    ; Registers r1, r2, r8, r9, r10, and r11 are clobbered. Register r0 is the
    ; return value. The other registers are not disturbed.

    ; There is painfully inadequate standardization around x64 calling conventions.
    ; On Win64, the first two arguments are passed in r1 and r2. On Unix, the first
    ; two arguments are passed in r7 and r6. We try to hide this behind macros. The
    ; two systems also have different conventions about which registers may be
    ; clobbered and which must be preserved. This code lives in the intersection.

    ; This has not yet been tested on Unix.

    %define UNIX 1 ; calling convention: 0 for Windows, 1 for Unix

    %macro function_with_one_parameter 0
    %if UNIX
    mov r1, r7 ;; UNIX
    %endif
    %endmacro

    %macro function_with_two_parameters 0
    %if UNIX
    mov r1, r7 ;; UNIX
    mov r2, r6 ;; UNIX
    %endif
    %endmacro

    %macro call_with_one_parameter 1
    %if UNIX
    mov r7, r1 ;; UNIX
    %endif
    call %1
    %endmacro

    %macro call_with_two_parameters 1
    %if UNIX
    mov r7, r1 ;; UNIX
    mov r6, r2 ;; UNIX
    %endif
    call %1
    %endmacro

    %macro tail_with_one_parameter 1
    %if UNIX
    mov r7, r1 ;; UNIX
    %endif
    jmp %1
    %endmacro

    %macro tail_with_two_parameters 1
    %if UNIX
    mov r7, r1 ;; UNIX
    mov r6, r2 ;; UNIX
    %endif
    jmp %1
    %endmacro

    ; There may be a performance benefit in padding programs so that most jump
    ; destinations are aligned on 16 byte boundaries.

    %macro pad 0
    align 16
    %endmacro

    ; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_code section .text

    power: ; the powers of 10

    dq 1 ; 0
    dq 10 ; 1
    dq 100 ; 2
    dq 1000 ; 3
    dq 10000 ; 4
    dq 100000 ; 5
    dq 1000000 ; 6
    dq 10000000 ; 7
    dq 100000000 ; 8
    dq 1000000000 ; 9
    dq 10000000000 ; 10
    dq 100000000000 ; 11
    dq 1000000000000 ; 12
    dq 10000000000000 ; 13
    dq 100000000000000 ; 14
    dq 1000000000000000 ; 15
    dq 10000000000000000 ; 16
    dq 100000000000000000 ; 17
    dq 1000000000000000000 ; 18
    dq 10000000000000000000 ; 19

    ; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_coefficient: function_with_one_parameter
    ;(number: dec64) returns coefficient: int64

    ; Return the coefficient part from a dec64 number.

    mov r0, r1
    sar r0, 8
    ret

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_exponent: function_with_one_parameter
    ;(number: dec64) returns exponent: int64

    ; Return the exponent part, sign extended to 64 bits, from a dec64 number.
    ; dec64_exponent(nan) returns -128.

    movsx r0, r1_b ; r0 is the exponent
    ret

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_new: function_with_two_parameters
    ;(coefficient: int64, exponent: int64) returns number: dec64

    ; Construct a new dec64 number with a coefficient and an exponent.

    mov r0, r1 ; r0 is the coefficient
    test r0, r0 ; is the coefficient zero?
    jz return_zero
    mov r8, r2 ; r8 is the exponent

    ; Fall into pack.

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    pack:

    ; The pack function will combine the coefficient and exponent into a dec64.
    ; Numbers that are too huge to be contained in this format become nan.
    ; Numbers that are too tiny to be contained in this format become zero.

    ; The coefficient is in r0.
    ; The exponent is in r8.

    ; If the exponent is greater than 127, then the number is too big.
    ; But it might still be possible to salvage a value.

    cmp r8, 127 ; compare exponent with 127
    jg pack_decrease ; it might be possible to decrease it

    ; If the exponent is too small, or if the coefficient is too large, then some
    ; division is necessary. The absolute value of the coefficient is off by one
    ; for the negative because
    ; negative_extreme_coefficent = -(extreme_coefficent + 1)

    mov r10, r0 ; r10 is the coefficient
    mov r1, 3602879701896396800 ; the ultimate coefficient * 100
    not r10 ; r10 is -coefficient
    xor r11, r11 ; r11 is zero
    test r10, r10 ; look at the sign bit
    cmovs r10, r0 ; r10 is the absolute value of coefficient
    cmp r10, r1 ; compare with the actual coefficient
    jae pack_large ; deal with the very large coefficient
    mov r1, 36028797018963967 ; the ultimate coefficient - 1
    mov r9, -127 ; r9 is the ultimate exponent
    cmp r1, r10 ; compare with the actual coefficient
    adc r11, 0 ; add 1 to r11 if 1 digit too big
    mov r1, 360287970189639679 ; the ultimate coefficient * 10 - 1
    sub r9, r8 ; r9 is the difference from actual exponent
    cmp r1, r10 ; compare with the actual coefficient
    adc r11, 0 ; add 1 to r11 if 2 digits too big
    cmp r9, r11 ; which excess is larger?
    cmovl r9, r11 ; take the max
    test r9, r9 ; if neither was zero
    jnz pack_increase ; then increase the exponent by the excess
    shl r0, 8 ; shift the exponent into position
    cmovz r8, r0 ; if coefficient is zero, also zero the exp
    movzx r8, r8_b ; zero out all but the bottom 8 bits of exp
    or r0, r8 ; mix in the exponent
    ret ; whew
    pad

    pack_large:

    mov r1, r0 ; r1 is the coefficient
    sar r1, 63 ; r1 is -1 if negative, or 0 if positive
    mov r11, eight_over_ten ; magic number
    mov r9, r1 ; r9 is -1 or 0
    xor r0, r1 ; complement the coefficient if negative
    and r9, 1 ; r9 is 1 or 0
    add r0, r9 ; r0 is absolute value of coefficient
    add r8, 1 ; add 1 to the exponent
    mul r11 ; multiply abs(coefficient) by magic number
    mov r0, r2 ; r0 is the product shift 64 bits
    shr r0, 3 ; r0 is divided by 8: abs(coefficient) / 10
    xor r0, r1 ; complement coefficient if it was negative
    add r0, r9 ; coefficient's sign is restored
    jmp pack ; start over
    pad

    pack_increase:

    mov r10, power
    mov r10, [r10+r9*8] ; r10 is 10^r9
    cmp r9, 20 ; is the difference more than 20?
    jae return_zero ; if so, the result is zero (rare)
    mov r11, r10 ; r11 is the power of ten
    neg r11 ; r11 is the negation of the power of ten
    test r0, r0 ; examine the sign of the coefficient
    cmovns r11, r10 ; r11 has the sign of the coefficient
    sar r11, 1 ; r11 is half the signed power of ten
    add r0, r11 ; r0 is coefficient plus the rounding fudge
    cqo ; sign extend r0 into r2
    idiv r10 ; divide by the power of ten
    add r8, r9 ; increase the exponent
    jmp pack ; start over
    pad

    pack_decrease:

    ; The exponent is too big. We can attempt to reduce it by scaling back.
    ; This can salvage values in a small set of cases.

    imul r0, 10 ; try multiplying the coefficient by 10
    jo return_null ; if it overflows, we failed to salvage
    sub r8, 1 ; decrement the exponent
    test r8, -128 ; is the exponent still over 127?
    jnz pack_decrease ; until the exponent is less than 127
    mov r9, r0 ; r9 is the coefficient
    sar r9, 56 ; r9 is top 8 bits of the coefficient
    adc r9, 0 ; add the ninth bit
    jnz return_null ; the number is still too large
    shl r0, 8 ; shift the exponent into position
    cmovz r8, r0 ; if coefficient is zero, also zero exponent
    movzx r8, r8_b ; zero out all but bottom 8 bits of exponent
    or r0, r8 ; mix in the exponent
    ret

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_round: function_with_two_parameters
    ;(number: dec64, place: dec64) returns quantization: dec64

    ; The place argument indicates at what decimal place to round.
    ; -2 nearest cent
    ; 0 nearest integer
    ; 3 nearest thousand
    ; 6 nearest million
    ; 9 nearest billion

    ; The place should be between -16 and 16.

    pad

    round_begin:

    cmp r1_b, 128 ; is the number nan?
    jz return_null
    test r2_b, r2_b ; is places already an integer?
    jnz round_places ; nope
    mov r10, r1 ; r10 is the number
    mov r11, r2 ; r11 is the places
    movsx r8, r10_b ; r8 is the current exponent
    mov r0, r10 ; r0 is the coefficient
    sar r11, 8 ; r11 is place: int64
    sar r0, 8 ; r0 is the coefficient of the number
    jz return_zero ; if the coefficient is 0, the result is 0
    cmp r8, r11 ; compare the exponents
    jge pack ; no rounding required
    mov r1, r0
    mov r9, eight_over_ten ; magic
    neg r1 ; r1 is -coefficient
    cmovns r0, r1 ; r0 is abs(coefficient)
    pad

    round_loop:

    ; Increment the exponent and divide the coefficient by 10 until the target
    ; exponent is reached. The division is accomplished by multiplying with a
    ; scaled reciprocal.

    mul r9 ; r2 is the coefficient * 8 / 10
    mov r0, r2 ; r0 is the coefficient * 8 / 10
    sar r0, 3 ; r0 is the coefficient / 10
    add r8, 1 ; increment the exponent
    cmp r8, r11 ; compare the exponent with the target
    jne round_loop ; loop if the exponent is not at the target

    ; Round if necessary and return the result.

    shr r2, 2 ; isolate the carry bit
    and r2, 1 ; r2 is 1 if rounding is needed
    add r0, r2 ; r0 is rounded
    mov r1, r0
    neg r1
    test r10, r10 ; was the original number negative?
    cmovs r0, r1
    jmp pack ; pack it up
    pad

    round_places:

    ; Places does not seem to be an integer. If it is nan, then default to zero.

    cmp r2_b, 128
    jne round_normal
    xor r2, r2
    jmp round_begin
    pad

    round_normal:

    mov r10, r1 ; save the number
    mov r1, r2 ; pass the place
    call_with_one_parameter dec64_normal
    test r0_b, r0_b ; is the number of places a normal integer?
    jnz return_null
    mov r1, r10
    mov r2, r0
    jmp round_begin

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_add: function_with_two_parameters
    ;(augend: dec64, addend: dec64) returns sum: dec64

    ; Add two dec64 numbers together.

    ; If the two exponents are both zero (which is usually the case for integers)
    ; we can take the fast path. Since the exponents are both zero, we can simply
    ; add the numbers together and check for overflow.

    pad

    add_begin:

    mov r0, r1 ; r0 is the first number
    or r1_b, r2_b ; r1_b is the two exponents or'd together
    jnz add_slow ; if either exponent is not zero, slow path
    add r0, r2 ; add the shifted coefficients together
    jo add_overflow ; if there was no overflow, we are done
    ret ; no need to pack
    pad

    add_overflow:

    ; If there was an overflow (extremely unlikely) then we must make it fit.
    ; pack knows how to do that.

    rcr r0, 1 ; divide the sum by 2 and repair its sign
    movsx r8, r1_b ; r8 is the exponent
    sar r0, 7 ; r0 is the coefficient of the sum
    jmp pack ; pack it up
    pad

    add_slow:

    ; The slow path is taken if the two operands do not both have zero exponents.

    mov r1, r0 ; restore r1
    cmp r0_b, 128 ; is the first operand nan?
    je return_null ; if nan, get out

    ; Are the two exponents the same? This will happen often, especially with
    ; money values.

    cmp r1_b, r2_b ; compare the two exponents
    jne add_slower ; if not equal, take the slower path

    ; The exponents match so we may add now. Zero out the exponents so there
    ; will be no carry into the coefficients when the coefficients are added.
    ; If the result is zero, then return the normal zero.

    and r0, -256 ; remove the exponent
    and r2, -256 ; remove the other exponent
    add r0, r2 ; add the shifted coefficients
    jo add_overflow ; if it overflows, it must be repaired
    cmovz r1, r0 ; if coefficient is zero, exponent is zero
    movzx r1, r1_b ; mask the exponent
    or r0, r1 ; mix in the exponent
    ret ; no need to pack
    pad

    add_slower:

    ; The slower path is taken when neither operand is nan, and their
    ; exponents are different. Before addition can take place, the exponents
    ; must be made to match. Swap the numbers if the second exponent is greater
    ; than the first.

    cmp r2_b, 128 ; Is the second operand nan?
    je return_null
    cmp r1_b, r2_b ; compare the exponents
    mov r0, r1 ; r0 is the first number
    cmovl r1, r2 ; r1 is the number with the larger exponent
    cmovl r2, r0 ; r2 is the number with the smaller exponent

    ; Shift the coefficients of r1 and r2 into r10 and r11 and unpack the exponents.

    mov r10, r1 ; r10 is the first number
    mov r11, r2 ; r11 is the second number
    movsx r8, r1_b ; r8 is the first exponent
    movsx r9, r2_b ; r9 is the second exponent
    sar r10, 8 ; r10 is the first coefficient
    sar r11, 8 ; r11 is the second coefficient
    mov r0, r10 ; r0 is the first coefficient
    pad

    add_slower_decrease:

    ; The coefficients are not the same. Before we can add, they must be the same.
    ; We will try to decrease the first exponent. When we decrease the exponent
    ; by 1, we must also multiply the coefficient by 10. We can do this as long as
    ; there is no overflow. We have 8 extra bits to work with, so we can do this
    ; at least twice, possibly more.

    imul r0, 10 ; before decrementing the exponent, multiply
    jo add_slower_increase
    sub r8, 1 ; decrease the first exponent
    mov r10, r0 ; r10 is the enlarged first coefficient
    cmp r8, r9 ; are the exponents equal yet?
    jg add_slower_decrease
    mov r0, r11 ; r0 is the second coefficient

    ; The exponents are now equal, so the coefficients may be added.

    add r0, r10 ; add the two coefficients
    jmp pack ; pack it up
    pad

    add_slower_increase:

    ; We cannot decrease the first exponent any more, so we must instead try to
    ; increase the second exponent, which will result in a loss of significance.
    ; That is the heartbreak of floating point.

    ; Determine how many places need to be shifted. If it is more than 17, there is
    ; nothing more to add.

    mov r2, r8 ; r2 is the first exponent
    sub r2, r9 ; r2 is the remaining exponent difference
    mov r0, r11 ; r0 is the second coefficient
    cmp r2, 17 ; 17 is the max digits in a packed coefficient
    ja return_r1 ; too small to matter
    mov r9, power
    mov r9, [r9+r2*8] ; r9 is the power of ten
    cqo ; sign extend r0 into r2
    idiv r9 ; divide second coefficient by power of 10
    test r0, r0 ; examine the scaled coefficient
    jz return_r1 ; too insignificant to add?

    ; The exponents are now equal, so the coefficients may be added.

    add r0, r10 ; add the two coefficients
    jmp pack
    pad

    return_r1:

    mov r0, r1 ; r0 is the original number
    ret

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_ceiling: function_with_one_parameter
    ;(number: dec64) returns integer: dec64

    ; Produce the smallest integer that is greater than or equal to the number. In
    ; the result, the exponent will be greater than or equal to zero unless it is
    ; nan. Numbers with positive exponents will not be modified, even if the
    ; numbers are outside of the safe integer range.

    ; Preserved: r11.

    mov r9, 1 ; r9 is the round up flag
    jmp floor_begin

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_floor: function_with_one_parameter
    ;(number: dec64) returns integer: dec64

    ; Produce the largest integer that is less than or equal to the number. This
    ; is sometimes called the entier function. In the result, the exponent will be
    ; greater than or equal to zero unless it is nan. Numbers with positive
    ; exponents will not be modified, even if the numbers are outside of the safe
    ; integer range.

    ; Preserved: r11.

    mov r9, -1 ; r9 is the round down flag
    pad

    floor_begin:

    cmp r1_b, 128
    je return_null
    mov r0, r1 ; r0 is the number
    movsx r8, r1_b ; r8 is the exponent
    sar r0, 8 ; r0 is the coefficient
    cmovz r1, r0 ; if coefficient is zero, the number is zero
    neg r8 ; r8 is the negated exponent
    test r1_b, r1_b ; examine the exponent
    jns return_r1 ; nothing to do unless exponent was negative
    cmp r8, 17 ; is the exponent is too extreme?
    jae floor_micro ; deal with a micro number
    mov r10, power
    mov r10, [r10+r8*8] ; r10 is the power of ten
    cqo ; sign extend r0 into r2
    idiv r10 ; divide r2:r0 by 10
    test r2, r2 ; examine the remainder
    jnz floor_remains ; deal with the remainder
    shl r0, 8 ; pack the coefficient
    ret
    pad

    floor_micro:

    mov r2, r0 ; r2 is the coefficient
    xor r0, r0 ; r0 is zero
    pad

    floor_remains:

    ; If the remainder is negative and the rounding flag is negative, then we need
    ; to decrement r0. But if the remainder and the rounding flag are both
    ; positive, then we need to increment r0.

    xor r10, r10 ; r10 is zero
    xor r2, r9 ; xor the remainder and the rounding
    cmovs r9, r10 ; if different signs, clear the rounding
    add r0, r9 ; add the rounding to the coefficient
    shl r0, 8 ; pack the coefficient
    ret

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_subtract: function_with_two_parameters
    ;(minuend: dec64, subtrahend: dec64) returns difference: dec64

    ; Subtract the dec64 number in r2 from the dec64 number in r1.
    ; The result is in r0.

    ; This is the same as dec64_add, except that the operand in r2 has its
    ; coefficient complemented first.

    xor r2, -256 ; not the coefficient only
    add r2, 256 ; two's complement increment of coefficient
    jno add_begin ; if there is no overflow, begin the beguine

    ; The subtrahend coefficient is -36028797018963968. This value cannot easily be
    ; complemented, so take the slower path. This should be extremely rare.

    cmp r1_b, 128 ; is the first operand nan
    sete r0_b
    cmp r2_b, 128 ; is the second operand nan?
    sete r0_h
    or r0_b, r0_h ; is either nan?
    jnz return_null
    mov r10, r1 ; r10 is the first coefficient
    movsx r8, r1_b ; r8 is the first exponent
    sar r10, 8
    movsx r9, r2_b ; r9 is the second exponent
    mov r11, 80000000000000H ; r11 is 36028797018963968
    mov r0, r10 ; r0 is the first coefficient
    cmp r8, r9 ; if the second exponent is larger, swap
    jge subtract_slower_decrease_compare
    mov r0, r11 ; r0 is the second coefficient
    xchg r8, r9 ; swap the exponents
    xchg r10, r11 ; swap the coefficients
    jmp subtract_slower_decrease_compare
    pad

    subtract_slower_decrease:

    ; The coefficients are not the same. Before we can add, they must be the same.
    ; We will try to decrease the first exponent. When we decrease the exponent
    ; by 1, we must also multiply the coefficient by 10. We can do this as long as
    ; there is no overflow. We have 8 extra bits to work with, so we can do this
    ; at least twice, possibly more.

    imul r0, 10 ; before decrementing the exponent, multiply
    jo subtract_slower_increase
    sub r8, 1 ; decrease the first exponent
    mov r10, r0 ; r10 is the enlarged first coefficient
    pad

    subtract_slower_decrease_compare:

    cmp r8, r9 ; are the exponents equal yet?
    jg subtract_slower_decrease
    mov r0, r11 ; r0 is the second coefficient

    ; The exponents are now equal, so the coefficients may be added.

    add r0, r10 ; add the two coefficients
    jmp pack ; pack it up
    pad

    subtract_slower_increase:

    ; We cannot decrease the first exponent any more, so we must instead try to
    ; increase the second exponent, which will result in a loss of significance.
    ; That is the heartbreak of floating point.

    ; Determine how many places need to be shifted. If it is more than 17, there is
    ; nothing more to add.

    mov r2, r8 ; r2 is the first exponent
    sub r2, r9 ; r2 is the remaining exponent difference
    mov r0, r11 ; r0 is the second coefficient
    cmp r2, 17 ; 17 is the max digits in packed coefficient
    ja subtract_underflow ; too small to matter
    mov r9, power
    mov r9, [r9+r2*8] ; r9 is the power of ten
    cqo ; sign extend r0 into r2
    idiv r9 ; divide second coefficient by power of 10

    ; The exponents are now equal, so the coefficients may be added.

    add r0, r10 ; add the two coefficients
    jmp pack
    pad

    subtract_underflow:

    mov r0, r10 ; r0 is the first coefficient
    jmp pack

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_multiply: function_with_two_parameters
    ;(multiplicand: dec64, multiplier: dec64) returns product: dec64

    ; Multiply two dec64 numbers together.

    ; Unpack the exponents into r8 and r9.

    movsx r8, r1_b ; r8 is the first exponent
    movsx r9, r2_b ; r9 is the second exponent

    ; Set flags in r0 if either operand is nan.

    cmp r1_b, 128 ; is the first operand nan?
    sete r0_b ; r0_b is 1 if the first operand is nan
    cmp r2_b, 128 ; is the second operand nan?
    sete r0_h ; r0_h is 1 if the second operand is nan

    ; Unpack the coefficients. Set flags in r1 if either is not zero.

    sar r1, 8 ; r1 is the first coefficient
    mov r10, r1 ; r10 is the first coefficient
    setnz r1_b ; r1_b is 1 if first coefficient is not zero
    sar r2, 8 ; r2 is the second coefficient
    setnz r1_h ; r1_h is 1 if second coefficient is not 0

    ; The result is nan if one or both of the operands is nan and neither of the
    ; operands is zero.

    or r1_w, r0_w ; is either coefficient zero and not nan?
    xchg r1_b, r1_h
    test r0_w, r1_w
    jnz return_null

    mov r0, r10 ; r0 is the first coefficient
    add r8, r9 ; r8 is the product exponent
    imul r2 ; r2:r0 is r1 * r2
    jno pack ; if product fits in 64 bits, start packing

    ; There was overflow.

    ; Make the 110 bit coefficient in r2:r0Er8 all fit. Estimate the number of
    ; digits of excess, and increase the exponent by that many digits.
    ; We use 77/256 to convert log2 to log10.

    mov r9, r2 ; r9 is the excess significance
    xor r1, r1 ; r1 is zero anticipating bsr
    neg r9
    cmovs r9, r2 ; r9 is abs of excess significance

    bsr r1, r9 ; find the position of most significant bit
    imul r1, 77 ; multiply the bit number by 77/256 to
    shr r1, 8 ; convert a bit number to a digit number
    add r1, 2 ; add two extra digits to the scale
    add r8, r1 ; increase the exponent
    mov r9, power
    idiv qword [r9+r1*8] ; divide by the power of ten
    jmp pack

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_divide: function_with_two_parameters
    ;(dividend: dec64, divisor: dec64) returns quotient: dec64

    ; Divide a dec64 number by another.

    ; Begin unpacking the components.

    cmp r2, 200h
    jz divide_two
    movsx r8, r1_b ; r8 is the first exponent
    movsx r9, r2_b ; r9 is the second exponent
    mov r10, r1 ; r10 is the first number
    mov r11, r2 ; r11 is the second number

    ; Set nan flags in r0.

    cmp r1_b, 128 ; is the first operand nan?
    sete r0_b ; r0_b is 1 if the first operand is nan
    cmp r2_b, 128 ; is the second operand nan?
    sete r0_h ; r0_h is 1 if the second operand is nan

    sar r10, 8 ; r10 is the dividend coefficient
    setnz r1_b ; r1_b is 1 if dividend coefficient is not 0
    or r1_b, r0_b ; r0_b is 1 if either is nan or divisor is 0
    jz return_zero ; if the dividend is zero, quotient is zero
    sar r11, 8 ; r11 is the divisor coefficient
    setz r1_b ; r1_h is 1 if dividing by zero
    or r0_h, r0_b ; r0_h is 1 if either is nan
    or r1_b, r0_h ; r1_b is zero if dividend is zero & not nan
    jnz return_null
    sub r8, r9 ; r8 is the quotient exponent
    pad

    divide_measure:

    ; We want to get as many bits into the quotient as possible in order to capture
    ; enough significance. But if the quotient has more than 64 bits, then there
    ; will be a hardware fault. To avoid that, we compare the magnitudes of the
    ; dividend coefficient and divisor coefficient, and use that to scale the
    ; dividend to give us a good quotient.

    mov r0, r10 ; r0 is the first coefficient
    mov r1, r11 ; r1 is the second coefficient
    neg r0 ; r0 is negated
    cmovs r0, r10 ; r0 is abs of dividend coefficient
    neg r1 ; r1 is negated
    cmovs r1, r11 ; r1 is abs of divisor coefficient
    bsr r0, r0 ; r0 is the dividend most significant bit
    bsr r1, r1 ; r1 is the divisor most significant bit

    ; Scale up the dividend to be approximately 58 bits longer than the divisor.
    ; Scaling uses factors of 10, so we must convert from a bit count to a digit
    ; count by multiplication by 77/256 (approximately LN2/LN10).

    add r1, 58 ; we want approx 58 bits in raw quotient
    sub r1, r0 ; r1 is the number of bits to add to the dividend
    imul r1, 77 ; multiply by 77/256 to convert bits|digits
    shr r1, 8 ; r1 is number of digits to scale dividend

    ; The largest power of 10 that can be held in an int64 is 1e18.

    cmp r1, 18 ; prescale the dividend if 10**r1 won't fit
    jg divide_prescale

    ; Multiply the dividend by the scale factor, and divide that 128 bit result by
    ; the divisor. Because of the scaling, the quotient is guaranteed to use most
    ; of the 64 bits in r0, and never more. Reduce the final exponent by the number
    ; of digits scaled.

    mov r0, r10 ; r0 is the dividend coefficient
    mov r9, power
    imul qword [r9+r1*8] ; r2:r0 is the dividend coefficient * 10**r1
    idiv r11 ; r0 is the quotient
    sub r8, r1 ; r8 is the exponent
    jmp pack ; pack it up
    pad

    divide_prescale:

    ; If the number of scaling digits is larger than 18, then we will have to
    ; scale in two steps: first prescaling the dividend to fill a register, and
    ; then repeating to fill a second register. This happens when the divisor
    ; coefficient is much larger than the dividend coefficient.

    mov r1, 58 ; we want 58 bits or so in the dividend
    sub r1, r0 ; r1 is the number of additional bits needed
    imul r1, 77 ; convert bits to digits
    shr r1, 8 ; shift 8 is cheaper than div 256
    mov r9, power
    imul r10, qword [r9+r1*8] ; multiply the dividend by power of ten
    sub r8, r1 ; reduce the exponent
    jmp divide_measure ; try again

    divide_two:

    ; Divide a dec64 number by two.

    cmp r1_b, 128
    je return_null
    test r1_h, 1
    jz divide_half

    ; Unpack the components into r8 and r1, multiply by 5 and divide by 10.

    movsx r8, r1_b ; r8 is the exponent
    sar r1, 8 ; r1 is the coefficient
    sub r8, 1 ; bump down the exponent
    lea r0, [r1+r1*4] ; r0 is the coefficient * 5
    jmp pack
    pad

    divide_half:

    ; If the least significant bit of the coefficient is 0, then we can do this
    ; the fast way. Shift the coefficient by 1 bit and restore the exponent. If
    ; the shift produces zero, even easier.

    mov r0, -256 ; r0 is the coefficient mask
    and r0, r1 ; r0 is the coefficient shifted 8 bits
    jz return
    sar r0, 1 ; r0 is divided by 2
    movzx r1, r1_b ; zero out r1 except lowest 8 bits
    or r0, r1 ; mix in the exponent
    ret

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_integer_divide: function_with_two_parameters
    ;(dividend: dec64, divisor: dec64) returns quotient: dec64

    ; Divide, with a floored integer result. It produces the same result as
    ; dec64_floor(dec64_divide(dividend, divisor))
    ; but can sometimes produce that result more quickly.

    cmp r1_b, r2_b ; are the exponents equal?
    jne integer_divide_slow

    mov r0, r1 ; r0 is the dividend
    mov r11, r2 ; r11 is the divisor
    sar r1, 8 ; r1 is the dividend coefficient
    setnz r2_h ; r2_h is 1 if dividend coefficient is not 0
    cmp r0_b, 128 ; are the operands nan?
    sete r2_b ; r2_b is 1 if the operands are nan
    and r0, -256 ; zero the dividend's exponent
    or r2_h, r2_b ; r2_h is zero if dividend is 0 and not nan
    jz return_zero ; the quotient is zero if dividend is zero
    sar r11, 8 ; r11 is the divisor coefficient
    setz r2_h ; r2_h is 1 if divisor coefficient is zero
    or r2_b, r2_h ; r2_b is 1 if the result is nan
    jnz return_null
    cqo ; sign extend r0 into r2
    idiv r11 ; r0 is the quotient
    and r0, -256 ; zero the exponent again
    ret ; no need to pack
    pad

    integer_divide_slow:

    ; The exponents are not the same, so do it the hard way.

    call_with_two_parameters dec64_divide
    mov r1, r0
    tail_with_one_parameter dec64_floor

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_modulo: function_with_two_parameters
    ;(dividend: dec64, divisor: dec64) returns modulus: dec64

    ; Modulo. It produces the same result as
    ; dec64_subtract(
    ; dividend,
    ; dec64_multiply(
    ; dec64_integer_divide(dividend, divisor),
    ; divisor
    ; )
    ; )

    cmp r1_b, r2_b ; are the two exponents the same?
    jnz modulo_slow ; if not take the slow path
    mov r0, r1 ; r0 is the dividend
    mov r11, r2 ; r11 is the divisor
    cmp r1_b, 128 ; is the first operand nan?
    setne r2_h ; r2_h is 1 if the operands are not nan
    sar r0, 8 ; r0 is the dividend coefficient
    setz r2_b ; r2_b is 1 if dividend coefficient is zero
    test r2_b, r2_h ; is the dividend is zero and not nan?
    jnz return_zero ; quotient is zero if the dividend is zero
    sar r11, 8 ; r11 is the divisor coefficient
    setnz r2_b ; r2_b is 1 if divisor coefficient is not 0
    test r2_h, r2_b ; is either operand nan or is divisor zero?
    jz return_null
    cqo ; sign extend r0 into r2
    idiv r11 ; divide r2:r0 by the divisor

    ; If the signs of the divisor and remainder are different and the remainder is
    ; not zero, add the divisor to the remainder.

    xor r0, r0 ; r0 is zero
    mov r10, r2 ; r10 is the remainder
    test r2, r2 ; examine the remainder
    cmovz r11, r0 ; r11 is zero if the remainder is zero
    xor r10, r11 ; r10 is remainder xor divisor
    cmovs r0, r11 ; r0 is divisor if the signs were different
    add r0, r2 ; r0 is the corrected result
    cmovz r1, r0 ; if r0 is zero, so is the exponent
    shl r0, 8 ; position the coefficient
    mov r0_b, r1_b ; mix in the exponent
    ret
    pad

    modulo_slow:

    ; The exponents are not the same, so do it the hard way.

    push r1 ; save the dividend
    push r2 ; save the divisor

    call_with_two_parameters dec64_integer_divide
    pop r2
    mov r1, r0
    call_with_two_parameters dec64_multiply
    pop r1
    mov r2, r0
    tail_with_two_parameters dec64_subtract

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_signum: function_with_one_parameter
    ;(number: dec64) returns signature: dec64

    ; If the number is nan, the result is nan.
    ; If the number is less than zero, the result is -1.
    ; If the number is zero, the result is 0.
    ; If the number is greater than zero, the result is 1.

    cmp r1_b, 128
    jz return_null
    mov r0, r1 ; r0 is the number
    sar r0, 8 ; r0 is the coefficient
    mov r8, r0 ; r8 is the coefficient too
    neg r0 ; r0 is -coefficient
    sar r8, 63 ; r8 is extended sign: -1 if negative else 0
    shr r0, 63 ; r0 is 1 if the number was greater than 0
    or r0, r8 ; r0 is -1, 0, or 1
    shl r0, 8 ; package the number with a zero exponent
    ret

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_neg: function_with_one_parameter
    ;(number: dec64) returns negation: dec64

    ; Negate a number. We need to negate the coefficient without changing the
    ; exponent.

    cmp r1_b, 128 ; compare the exponent to nan
    je return_null ; is the number nan?
    mov r2, r1 ; r2 is the number
    mov r0, -256 ; r0 is the coefficient mask
    sar r2, 8 ; r2 is the coefficient
    cmovz r1, r2 ; if coefficient is zero, then exponent too
    xor r0, r1 ; r0 has exponent and complemented coef
    add r0, 256 ; r0 has exponent and negated coefficient
    jo neg_overflow ; nice day if it don't overflow
    ret

    neg_overflow:

    ; The coefficient is -36028797018963968, which is the only coefficient that
    ; cannot be trivially negated. So we do this the hard way.

    mov r0, r2
    movsx r8, r1_b ; r8 is the exponent
    neg r0
    jmp pack

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_abs: function_with_one_parameter
    ;(number: dec64) returns absolution: dec64

    ; Find the absolute value of a number. If the number is negative, hand it off
    ; to dec64_neg. Otherwise, return the number unless it is nan or zero.

    cmp r1_b, 128 ; is the number nan?
    je return_null
    test r1, r1 ; examine r1
    js dec64_neg ; is the number negative?
    mov r0, r1 ; r0 is the number
    sar r1, 8 ; r1 is the coefficient
    cmovz r0, r1 ; if coefficient is zero, result is zero
    ret ; return the number

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_is_equal: function_with_two_parameters
    ;(comparahend: dec64, comparator: dec64) returns comparison: dec64

    ; Compare two dec64 numbers. If they are equal, return 1, otherwise return 0.
    ; Denormal zeroes are equal but denormal nans are not.

    ; If the numbers are trivally equal, then return 1.

    cmp r1, r2 ; compare the two numbers
    je return_true

    ; If the comparator is nan, then we ask dec64_is_nan.

    cmp r2_b, 128
    je dec64_is_nan

    ; If the exponents match or if their signs are different, then return false.

    mov r0, r1 ; r0 is the first number
    xor r0, r2 ; r0 the xor of the two numbers
    sets r0_h ; r0_h is 1 if the signs are different
    cmp r1_b, r2_b ; compare the two exponents
    sete r0_b ; r0_b is 1 if the exponents are the same
    or r0_b, r0_h
    jnz return_false

    ; Do it the hard way by subtraction. Is the difference zero?

    call_with_two_parameters dec64_subtract ; r0 is r1 - r2
    cmp r0_b, 128 ; is the difference nan?
    je return_false
    or r0, r0 ; examine the difference
    jz return_true
    jmp return_false
    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_normal: function_with_one_parameter
    ;(number: dec64) returns normalization: dec64

    ; Make the exponent as close to zero as possible without losing any signficance.
    ; Usually normalization is not needed since it does not materially change the
    ; value of a number.

    ; Preserves r10 and r11.

    mov r0, r1 ; r0 is the number
    cmp r1_b, 128 ; compare the exponent to nan
    jz return_null ; if exponent is nan, the result is nan
    and r0, -256 ; r0 is the coefficient shifted 8 bits
    mov r8, 10 ; r8 is the divisor
    cmovz r1, r0 ; r1 is zero if r0 is zero
    mov r9, r0 ; r9 is the coefficient shifted 8 bits
    test r1_b, r1_b ; examine the exponent
    jz return ; if the exponent is zero, return r0
    jns normal_multiply ; if the exponent is positive
    sar r0, 8 ; r0 is the coefficient
    sar r9, 8 ; r9 is the coefficient
    pad

    normal_divide:

    ; While the exponent is less than zero, divide the coefficient by 10 and
    ; increment the exponent.

    cqo ; sign extend r0 into r2
    idiv r8 ; divide r2:r0 by 10
    test r2, r2 ; examine the remainder
    jnz normal_divide_done ; if r2 is not zero, we are done
    mov r9, r0 ; r9 is the coefficient
    add r1_b, 1 ; increment the exponent
    jnz normal_divide ; until the exponent is zero
    pad

    normal_divide_done:

    mov r0, r9 ; r0 is the finished coefficient
    shl r0, 8 ; put it in position
    mov r0_b, r1_b ; mix in the exponent
    ret
    pad

    normal_multiply:

    ; While the exponent is greater than zero, multiply the coefficient by 10 and
    ; decrement the exponent. If the coefficient gets too large, wrap it up.

    imul r0, 10 ; r0 is r0 * 10
    jo normal_multiply_done ; return zero if overflow
    mov r9, r0 ; r9 is the coefficient
    sub r1_b, 1 ; decrement the exponent
    jnz normal_multiply ; until the exponent is zero
    ret
    pad

    normal_multiply_done:

    mov r0, r9 ; r0 is the finished positioned coefficient
    mov r0_b, r1_b ; mix in the exponent
    ret

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    return:

    ; Return whatever is in r0.

    ret

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    return_null:

    ; All of the dec64_ functions return only this form of nan.

    mov r0, null
    ret

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    return_one:

    mov r0, 0100h ; one
    ret

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    return_zero:

    xor r0, r0 ; zero
    ret

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_is_less: function_with_two_parameters
    ;(comparahend: dec64, comparator: dec64) returns comparison: dec64

    ; Compare two dec64 numbers. If the first is less than the second, return true,
    ; otherwise return false. Any nan value is greater than any number value

    ; The other 3 comparison functions are easily implemented with dec64_is_less:

    ; dec64_is_greater(a, b) => dec64_is_less(b, a)
    ; dec64_is_greater_or_equal(a, b) => dec64_is_false(dec64_is_less(a, b))
    ; dec64_is_less_or_equal(a, b) => dec64_is_false(dec64_is_less(b, a))

    ; If the exponents are the same, then do a simple compare.

    cmp r1_b, r2_b ; are the two exponents equal?
    jne less_slow ; deal with unequal exponents
    cmp r1, r2 ; compare the numbers
    jge return_false ; greater or equal
    cmp r1_b, 128 ; are the arguments nan?
    jne return_true ; less than
    jmp return_false ; nan is not less than nan
    pad

    less_slow:

    ; The exponents are not the same.

    cmp r1_b, 128 ; is the first argument nan?
    je return_false ; nan is greater than numbers
    cmp r2_b, 128 ; is the second argument nan?
    je return_true ; numbers are less than nan

    ; Unpack the numbers.

    mov r11, r2 ; r11 is the second number
    movsx r8, r1_b ; r8 is the first exponent
    movsx r9, r2_b ; r9 is the second exponent
    mov r0, power ; r0 is the power array
    mov r2, 18 ; r2 is 18
    sar r1, 8 ; r1 is the first coefficient
    sar r11, 8 ; r11 is the second coefficient

    ; The maximum cofficient is 36028797018963967. 10**18 is more than that.

    sub r8, r9 ; r8 is the exponent difference
    js less_second ; is the difference negative?
    cmp r8, r2 ; compare the exponent difference to 18
    cmovge r8, r2 ; r8 is min(exponent difference, 18)

    ; We need to make them conform before we can compare. Multiply the first
    ; coefficient by 10**(first exponent - second exponent)

    mov r0, [r0+r8*8] ; r0 is power[difference]
    imul r1 ; r2, r0 is the amplified first coefficient
    jo less_overflow_first
    cmp r0, r11 ; is first coefficient less than the other?
    jl return_true
    jmp return_false
    pad

    less_second:

    neg r8
    cmp r8, r2 ; compare the exponent difference to 18
    cmovge r8, r2 ; r8 is min(exponent difference, 18)
    mov r0, [r0+r8*8] ; r0 is 10 ** min(exponent difference, 18)
    imul r11 ; r2, r0 is the amplified second coefficient
    jo less_overflow_second
    cmp r1, r0 ; is first coefficient less than the other?
    jl return_true
    jmp return_false
    pad

    less_overflow_first:

    test r2, r2
    jns return_false
    jmp return_true
    pad

    less_overflow_second:

    test r2, r2
    jns return_true
    jmp return_false
    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_is_integer: function_with_one_parameter
    ;(number: dec64) returns comparison: dec64

    ; If the number contains a non-zero fractional part or if it is nan, return
    ; false. Otherwise, return true.

    cmp r1_b, 128 ; nan exponent?
    je return_false ; nan is not an integer
    mov r0, r1 ; r0 is the number
    sar r0, 8 ; r0 is the coefficient
    jz return_true ; zero is an integer

    test r1_b, r1_b ; examine the exponent
    jns return_true ; a positive exponent means an integer
    cmp r1_b, -17 ; huge negative exponents can never be int
    jl return_false
    movsx r8, r1_b ; r8 is the first exponent
    neg r8 ; negate the exponent
    mov r9, power
    mov r10, [r9+r8*8] ; r10 is 10^-exponent
    cqo ; sign extend r0 into r2
    idiv r10 ; divide r2:r0 by the power of ten
    test r2, r2 ; examine the remainder
    jz return_true ; if the remainder is zero, then return true
    jmp return_false
    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_is_nan: function_with_one_parameter
    ;(number: dec64) returns comparison: dec64

    cmp r1_b, 128 ; is r1 nan?
    je return_true
    jmp return_false
    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_is_zero: function_with_one_parameter
    ;(number: dec64) returns comparison: dec64

    cmp r1_b, 128 ; is r1 nan?
    setne r0_b ; r0_b is 1 if not nan
    sar r1, 8 ; r1 is the coefficient
    setz r0_h ; r0_h is 1 if zero
    test r0_b, r0_h
    jnz return_true
    jmp return_false
    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    dec64_is_false: function_with_one_parameter
    ;(boolean: dec64) returns notation: dec64

    ; If the argument is false, the result is true.
    ; Otherwise, the result is false.

    mov r0, false
    cmp r1, r0
    je return_true

    return_false:

    mov r0, false
    ret

    return_true:

    mov r0, true
    ret
    59 changes: 59 additions & 0 deletions test_darwin.sh
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,59 @@
    #!/bin/sh

    # This script builds and runs the DEC64 tests for an x64 machine running MacOS
    # (Darwin). It takes no arguments.

    # Assemble DEC64 into dec64.o. We prefix the function names with an underscore,
    # because that is what clang expects.

    nasm \
    -f macho64 \
    --prefix _ \
    dec64.nasm \
    || exit

    build() {

    # Compile, assemble and link a source file into a binary executable.

    # We opt out of creating a position independent executable (a PIE), because we
    # depend on absolute addressing. Without passing the linker the -no_pie flag,
    # we get a warning like

    # PIE disabled. Absolute addressing (perhaps -mdynamic-no-pic) not allowed in
    # code signed PIE, but used in pack_increase from dec64.o.

    # because of lines in dec64.nasm like

    # mov r10, [r10+r9*8]

    # The tests shift negative integers on purpose, so we suppress that warning.

    src=$1
    bin=$2
    clang \
    -Wl,-no_pie \
    -Wno-shift-negative-value \
    -o $bin \
    dec64_string.c \
    dec64_math.c \
    dec64.o \
    $src
    }

    # Build the tests.

    build dec64_test.c dec64_test \
    && build dec64_string_test.c dec64_string_test \
    && build dec64_math_test.c dec64_math_test \
    || exit

    # Run the tests.

    ./dec64_test \
    && ./dec64_string_test \
    && ./dec64_math_test \
    || exit

    # This script has been tested on MacOS 11.4 (Big Sur), using NASM
    # v2.15.05 and clang 13.0.0.
    51 changes: 51 additions & 0 deletions test_linux.sh
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,51 @@
    #!/bin/sh

    # This script builds and runs the DEC64 tests for an x64 machine running Linux.
    # It takes no arguments.

    # Assemble DEC64 into dec64.o.

    nasm -f elf64 dec64.nasm || exit

    build() {

    # Compile, assemble and link a source file into a binary executable.

    # We opt out of creating a position independent executable (a PIE), because we
    # depend on absolute addressing. Without passing the the -no-pie flag, we get a
    # warning like

    # /usr/bin/ld: dec64.o: warning: relocation in read-only section `.text'
    # /usr/bin/ld: warning: creating DT_TEXTREL in a PIE

    # because of lines in dec64.nasm like

    # mov r10, [r10+r9*8]

    src=$1
    bin=$2
    gcc \
    -no-pie \
    -o $bin \
    dec64_string.c \
    dec64_math.c \
    dec64.o \
    $src
    }

    # Build the tests.

    build dec64_test.c dec64_test \
    && build dec64_string_test.c dec64_string_test \
    && build dec64_math_test.c dec64_math_test \
    || exit

    # Run the tests.

    ./dec64_test \
    && ./dec64_string_test \
    && ./dec64_math_test \
    || exit

    # This script has been tested on Debian GNU/Linux 11 (Bullseye), using NASM
    # v2.15.05 and gcc v10.2.1.