Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save define-private-public/07945570cbfe49b98bd88f0f8ea66e19 to your computer and use it in GitHub Desktop.

Select an option

Save define-private-public/07945570cbfe49b98bd88f0f8ea66e19 to your computer and use it in GitHub Desktop.

Revisions

  1. define-private-public revised this gist Jan 19, 2025. 2 changed files with 108 additions and 0 deletions.
    61 changes: 61 additions & 0 deletions analytical_in_unit_disk_gcc_14.2_O3.asm
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,61 @@
    .L44:
    divsd xmm6, QWORD PTR [rsp+8]
    comisd xmm6, QWORD PTR .LC1[rip]
    jnb .L30
    pxor xmm0, xmm0
    addsd xmm6, xmm0
    ucomisd xmm0, xmm6
    ja .L42
    sqrtsd xmm6, xmm6
    .L16:
    mov rax, QWORD PTR .LC1[rip]
    mov ecx, 2
    pxor xmm7, xmm7
    mov QWORD PTR [rsp+8], rax
    cmp r12, 624
    je .L17
    .L46:
    mov rdx, QWORD PTR [rsp+32+r12*8]
    add r12, 1
    .L18:
    mov rax, rdx
    shr rax, 11
    mov eax, eax
    xor rax, rdx
    mov rdx, rax
    sal rdx, 7
    and edx, 2636928640
    xor rax, rdx
    mov rdx, rax
    sal rdx, 15
    and edx, 4022730752
    xor rax, rdx
    mov rdx, rax
    shr rdx, 18
    xor rax, rdx
    js .L21
    pxor xmm0, xmm0
    cvtsi2sd xmm0, rax
    .L22:
    mulsd xmm0, QWORD PTR [rsp+8]
    fld DWORD PTR .LC7[rip]
    fmul QWORD PTR [rsp+8]
    addsd xmm7, xmm0
    fstp QWORD PTR [rsp+8]
    cmp ecx, 1
    jne .L31
    divsd xmm7, QWORD PTR [rsp+8]
    comisd xmm7, QWORD PTR .LC1[rip]
    jnb .L32
    pxor xmm0, xmm0
    lea rdi, [rsp+24]
    lea rsi, [rsp+16]
    movsd QWORD PTR [rsp+8], xmm6
    addsd xmm7, xmm0
    mulsd xmm7, QWORD PTR .LC8[rip]
    addsd xmm0, xmm7
    call sincos
    movsd xmm6, QWORD PTR [rsp+8]
    movsd xmm0, QWORD PTR [rsp+16]
    movsd xmm1, QWORD PTR [rsp+24]
    mulsd xmm0, xmm6
    47 changes: 47 additions & 0 deletions rejection_in_unit_disk_gcc_14.2_O3.asm
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,47 @@
    .L16:
    movdqa xmm1, XMMWORD PTR [rax+8]
    movdqu xmm0, XMMWORD PTR [rax]
    add rax, 16
    pand xmm0, xmm5
    pand xmm1, xmm3
    por xmm1, xmm0
    movdqa xmm2, xmm1
    pand xmm1, xmm4
    movdqa xmm0, xmm1
    psrlq xmm2, 1
    pxor xmm2, XMMWORD PTR [rax-1832]
    psllq xmm0, 3
    paddq xmm0, xmm1
    psllq xmm0, 9
    paddq xmm0, xmm1
    psllq xmm0, 5
    paddq xmm0, xmm1
    psllq xmm0, 2
    psubq xmm0, xmm1
    psllq xmm0, 3
    psubq xmm0, xmm1
    movdqa xmm10, xmm0
    psllq xmm10, 4
    paddq xmm0, xmm10
    psllq xmm0, 5
    psubq xmm0, xmm1
    pxor xmm2, xmm0
    movups XMMWORD PTR [rax-16], xmm2
    cmp rax, rsi
    jne .L16
    mov rsi, QWORD PTR [rsp+16]
    mov rax, QWORD PTR [rsp+5000]
    mov r12d, 1
    mov rdi, rsi
    and rax, -2147483648
    and edi, 2147483647
    or rax, rdi
    mov rdi, rax
    and eax, 1
    neg rax
    shr rdi
    xor rdi, QWORD PTR [rsp+3184]
    and eax, 2567483615
    xor rax, rdi
    mov QWORD PTR [rsp+5000], rax
    jmp .L14
  2. define-private-public revised this gist Jan 19, 2025. 1 changed file with 0 additions and 92 deletions.
    92 changes: 0 additions & 92 deletions analytical_in_unit_disk_gcc_14.2_O0.asm
    Original file line number Diff line number Diff line change
    @@ -1,95 +1,3 @@
    ; RNG::analytical_in_unit_disk() -- unoptimized listing.
    ; In: rdi = RNG object pointer (saved at [rbp-56]).
    ; Calls RNG::num(.LC2, .LC1) and takes sqrt of the result (kept at [rbp-8]),
    ; calls RNG::num(.LC2, .LC3) (kept at [rbp-16]), then constructs a Vec2 at
    ; [rbp-48] from cos([rbp-16]) * [rbp-8] and sin([rbp-16]) * [rbp-8].
    ; Out: the two 8-byte halves of that Vec2 in xmm0 and xmm1.
    ; The values of .LC1, .LC2 and .LC3 are not shown in this listing.
    RNG::analytical_in_unit_disk(): ; 49 instructions
    push rbp
    mov rbp, rsp
    sub rsp, 64
    mov QWORD PTR [rbp-56], rdi ; save the RNG object pointer
    movsd xmm0, QWORD PTR .LC1[rip]
    mov rax, QWORD PTR [rbp-56]
    movapd xmm1, xmm0 ; second double argument = .LC1
    mov rdx, QWORD PTR .LC2[rip]
    movq xmm0, rdx ; first double argument = .LC2
    mov rdi, rax ; object pointer argument
    call RNG::num(double, double) ; 19 instructions
    movq rax, xmm0
    movq xmm0, rax
    call sqrt
    movq rax, xmm0
    mov QWORD PTR [rbp-8], rax ; [rbp-8] = sqrt of the first RNG::num result
    movsd xmm0, QWORD PTR .LC3[rip]
    mov rax, QWORD PTR [rbp-56]
    movapd xmm1, xmm0 ; second double argument = .LC3
    mov rdx, QWORD PTR .LC2[rip]
    movq xmm0, rdx ; first double argument = .LC2
    mov rdi, rax
    call RNG::num(double, double) ; 19 instructions
    movq rax, xmm0
    mov QWORD PTR [rbp-16], rax ; [rbp-16] = second RNG::num result
    mov rax, QWORD PTR [rbp-16]
    movq xmm0, rax
    call cos
    movsd xmm1, QWORD PTR [rbp-8]
    mulsd xmm0, xmm1
    movsd QWORD PTR [rbp-24], xmm0 ; [rbp-24] = cos([rbp-16]) * [rbp-8]
    mov rax, QWORD PTR [rbp-16]
    movq xmm0, rax
    call sin
    movsd xmm1, QWORD PTR [rbp-8]
    mulsd xmm0, xmm1
    movsd QWORD PTR [rbp-32], xmm0 ; [rbp-32] = sin([rbp-16]) * [rbp-8]
    movsd xmm0, QWORD PTR [rbp-32]
    mov rdx, QWORD PTR [rbp-24]
    lea rax, [rbp-48] ; address of the Vec2 being constructed
    movapd xmm1, xmm0 ; second constructor argument = sin term
    movq xmm0, rdx ; first constructor argument = cos term
    mov rdi, rax
    call Vec2::Vec2(double, double) [complete object constructor] ; 14 instructions
    mov rax, QWORD PTR [rbp-48]
    mov rdx, QWORD PTR [rbp-40]
    movq xmm0, rax ; return the first 8 bytes of the Vec2 in xmm0
    movq xmm1, rdx ; return the second 8 bytes of the Vec2 in xmm1
    leave
    ret

    --------

    The above method:

    (49 - 3) + 19 + 19 + 14: 98 instructions (albeit 6 calls)
    Vec2::Vec2(double, double) [base object constructor]: ; 14 instructions
    push rbp
    mov rbp, rsp
    mov QWORD PTR [rbp-8], rdi
    movsd QWORD PTR [rbp-16], xmm0
    movsd QWORD PTR [rbp-24], xmm1
    mov rax, QWORD PTR [rbp-8]
    movsd xmm0, QWORD PTR [rbp-16]
    movsd QWORD PTR [rax], xmm0
    mov rax, QWORD PTR [rbp-8]
    movsd xmm0, QWORD PTR [rbp-24]
    movsd QWORD PTR [rax+8], xmm0
    nop
    pop rbp
    ret
    ; RNG::num(double, double) -- unoptimized listing.
    ; In: rdi = RNG object pointer, xmm0 = first double (a), xmm1 = second double (b).
    ; Calls std::uniform_real_distribution<double>::operator() with rdi+5000 as the
    ; first pointer argument and the RNG object pointer as the second.
    ; Out: xmm0 = (b - a) * result + a.
    RNG::num(double, double): ; 19 instructions
    push rbp
    mov rbp, rsp
    sub rsp, 48
    mov QWORD PTR [rbp-24], rdi ; save the RNG object pointer
    movsd QWORD PTR [rbp-32], xmm0 ; save a
    movsd QWORD PTR [rbp-40], xmm1 ; save b
    mov rax, QWORD PTR [rbp-24]
    lea rdx, [rax+5000] ; address 5000 bytes into the RNG object
    mov rax, QWORD PTR [rbp-24]
    mov rsi, rax ; second argument = RNG object pointer
    mov rdi, rdx ; first argument = RNG object pointer + 5000
    call double std::uniform_real_distribution<double>::operator()<std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul> >(std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>&)
    movq rax, xmm0
    mov QWORD PTR [rbp-8], rax ; [rbp-8] = value returned by the distribution call
    movsd xmm0, QWORD PTR [rbp-40]
    subsd xmm0, QWORD PTR [rbp-32] ; b - a
    mulsd xmm0, QWORD PTR [rbp-8] ; (b - a) * result
    addsd xmm0, QWORD PTR [rbp-32] ; (b - a) * result + a
    leave
    ret
    RNG::analytical_in_unit_disk(): ; 49 instructions
    push rbp
    mov rbp, rsp
  3. define-private-public revised this gist Jan 19, 2025. 2 changed files with 59 additions and 72 deletions.
    66 changes: 58 additions & 8 deletions analytical_in_unit_disk_gcc_14.2_O0.asm
    Original file line number Diff line number Diff line change
    @@ -1,4 +1,60 @@
    Vec2::Vec2(double, double) [base object constructor]: ; 14 instructions
    ; RNG::analytical_in_unit_disk() -- unoptimized listing.
    ; In: rdi = RNG object pointer (saved at [rbp-56]).
    ; Calls RNG::num(.LC2, .LC1) and takes sqrt of the result (kept at [rbp-8]),
    ; calls RNG::num(.LC2, .LC3) (kept at [rbp-16]), then constructs a Vec2 at
    ; [rbp-48] from cos([rbp-16]) * [rbp-8] and sin([rbp-16]) * [rbp-8].
    ; Out: the two 8-byte halves of that Vec2 in xmm0 and xmm1.
    ; The values of .LC1, .LC2 and .LC3 are not shown in this listing.
    RNG::analytical_in_unit_disk(): ; 49 instructions
    push rbp
    mov rbp, rsp
    sub rsp, 64
    mov QWORD PTR [rbp-56], rdi ; save the RNG object pointer
    movsd xmm0, QWORD PTR .LC1[rip]
    mov rax, QWORD PTR [rbp-56]
    movapd xmm1, xmm0 ; second double argument = .LC1
    mov rdx, QWORD PTR .LC2[rip]
    movq xmm0, rdx ; first double argument = .LC2
    mov rdi, rax ; object pointer argument
    call RNG::num(double, double) ; 19 instructions
    movq rax, xmm0
    movq xmm0, rax
    call sqrt
    movq rax, xmm0
    mov QWORD PTR [rbp-8], rax ; [rbp-8] = sqrt of the first RNG::num result
    movsd xmm0, QWORD PTR .LC3[rip]
    mov rax, QWORD PTR [rbp-56]
    movapd xmm1, xmm0 ; second double argument = .LC3
    mov rdx, QWORD PTR .LC2[rip]
    movq xmm0, rdx ; first double argument = .LC2
    mov rdi, rax
    call RNG::num(double, double) ; 19 instructions
    movq rax, xmm0
    mov QWORD PTR [rbp-16], rax ; [rbp-16] = second RNG::num result
    mov rax, QWORD PTR [rbp-16]
    movq xmm0, rax
    call cos
    movsd xmm1, QWORD PTR [rbp-8]
    mulsd xmm0, xmm1
    movsd QWORD PTR [rbp-24], xmm0 ; [rbp-24] = cos([rbp-16]) * [rbp-8]
    mov rax, QWORD PTR [rbp-16]
    movq xmm0, rax
    call sin
    movsd xmm1, QWORD PTR [rbp-8]
    mulsd xmm0, xmm1
    movsd QWORD PTR [rbp-32], xmm0 ; [rbp-32] = sin([rbp-16]) * [rbp-8]
    movsd xmm0, QWORD PTR [rbp-32]
    mov rdx, QWORD PTR [rbp-24]
    lea rax, [rbp-48] ; address of the Vec2 being constructed
    movapd xmm1, xmm0 ; second constructor argument = sin term
    movq xmm0, rdx ; first constructor argument = cos term
    mov rdi, rax
    call Vec2::Vec2(double, double) [complete object constructor] ; 14 instructions
    mov rax, QWORD PTR [rbp-48]
    mov rdx, QWORD PTR [rbp-40]
    movq xmm0, rax ; return the first 8 bytes of the Vec2 in xmm0
    movq xmm1, rdx ; return the second 8 bytes of the Vec2 in xmm1
    leave
    ret

    --------

    The above method:

    (49 - 3) + 19 + 19 + 14: 98 instructions (albeit 6 calls)
    Vec2::Vec2(double, double) [base object constructor]: ; 14 instructions
    push rbp
    mov rbp, rsp
    mov QWORD PTR [rbp-8], rdi
    @@ -84,10 +140,4 @@ RNG::analytical_in_unit_disk(): ; 49 instructions
    movq xmm0, rax
    movq xmm1, rdx
    leave
    ret

    --------

    The above method:

    (49 - 3) + 19 + 19 + 14: 98 instructions (albeit 6 calls)
    ret
    65 changes: 1 addition & 64 deletions rejection_in_unit_disk_gcc_14.2_O0.asm
    Original file line number Diff line number Diff line change
    @@ -1,56 +1,3 @@
    ; Vec2::Vec2(double, double) -- unoptimized listing.
    ; In: rdi = pointer to the object being constructed, xmm0 = first double,
    ; xmm1 = second double.
    ; Stores the first double at [rdi] and the second at [rdi+8].
    Vec2::Vec2(double, double) [base object constructor]: ; 14 instructions
    push rbp
    mov rbp, rsp
    mov QWORD PTR [rbp-8], rdi ; spill the object pointer
    movsd QWORD PTR [rbp-16], xmm0 ; spill the first double argument
    movsd QWORD PTR [rbp-24], xmm1 ; spill the second double argument
    mov rax, QWORD PTR [rbp-8]
    movsd xmm0, QWORD PTR [rbp-16]
    movsd QWORD PTR [rax], xmm0 ; object+0 = first argument
    mov rax, QWORD PTR [rbp-8]
    movsd xmm0, QWORD PTR [rbp-24]
    movsd QWORD PTR [rax+8], xmm0 ; object+8 = second argument
    nop
    pop rbp
    ret
    ; Vec2::length_squared() const -- unoptimized listing.
    ; In: rdi = pointer to the Vec2 object.
    ; Out: xmm0 = [rdi]*[rdi] + [rdi+8]*[rdi+8] (both fields read as doubles).
    Vec2::length_squared() const: ; 16 instructions
    push rbp
    mov rbp, rsp
    mov QWORD PTR [rbp-8], rdi ; spill the object pointer
    mov rax, QWORD PTR [rbp-8]
    movsd xmm1, QWORD PTR [rax]
    mov rax, QWORD PTR [rbp-8]
    movsd xmm0, QWORD PTR [rax]
    mulsd xmm1, xmm0 ; xmm1 = square of the double at object+0
    mov rax, QWORD PTR [rbp-8]
    movsd xmm2, QWORD PTR [rax+8]
    mov rax, QWORD PTR [rbp-8]
    movsd xmm0, QWORD PTR [rax+8]
    mulsd xmm0, xmm2 ; xmm0 = square of the double at object+8
    addsd xmm0, xmm1 ; xmm0 = sum of the two squares (return value)
    pop rbp
    ret
    ; RNG::num(double, double) -- unoptimized listing.
    ; In: rdi = RNG object pointer, xmm0 = first double (a), xmm1 = second double (b).
    ; Calls std::uniform_real_distribution<double>::operator() with rdi+5000 as the
    ; first pointer argument and the RNG object pointer as the second.
    ; Out: xmm0 = (b - a) * result + a.
    RNG::num(double, double): ; 19 instructions
    push rbp
    mov rbp, rsp
    sub rsp, 48
    mov QWORD PTR [rbp-24], rdi ; save the RNG object pointer
    movsd QWORD PTR [rbp-32], xmm0 ; save a
    movsd QWORD PTR [rbp-40], xmm1 ; save b
    mov rax, QWORD PTR [rbp-24]
    lea rdx, [rax+5000] ; address 5000 bytes into the RNG object
    mov rax, QWORD PTR [rbp-24]
    mov rsi, rax ; second argument = RNG object pointer
    mov rdi, rdx ; first argument = RNG object pointer + 5000
    call double std::uniform_real_distribution<double>::operator()<std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul> >(std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>&)
    movq rax, xmm0
    mov QWORD PTR [rbp-8], rax ; [rbp-8] = value returned by the distribution call
    movsd xmm0, QWORD PTR [rbp-40]
    subsd xmm0, QWORD PTR [rbp-32] ; b - a
    mulsd xmm0, QWORD PTR [rbp-8] ; (b - a) * result
    addsd xmm0, QWORD PTR [rbp-32] ; (b - a) * result + a
    leave
    ret
    RNG::rejection_in_unit_disk(): ; 41 instructions
    push rbp
    mov rbp, rsp
    @@ -93,14 +40,4 @@ RNG::rejection_in_unit_disk(): ; 41 instructions
    movq xmm1, rdx
    mov rbx, QWORD PTR [rbp-8]
    leave
    ret

    --------

    This needs to traverse quite a few instructions, perform calls (and can loop infinitely). Let's say we had success on our first go:
    41 + 19 + 19 + 14 + 16: 109 instructions (w/ 4 calls)
    If we had one failure but then a success, so the loop is run twice:
    5 + 2 * (29 + 19 + 19 + 14 + 16) + 7: 206 instructions (w/ 8 calls)
    I may be a little off on the math
    ret
  4. define-private-public created this gist Jan 19, 2025.
    93 changes: 93 additions & 0 deletions analytical_in_unit_disk_gcc_14.2_O0.asm
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,93 @@
    ; Vec2::Vec2(double, double) -- unoptimized listing.
    ; In: rdi = pointer to the object being constructed, xmm0 = first double,
    ; xmm1 = second double.
    ; Stores the first double at [rdi] and the second at [rdi+8].
    Vec2::Vec2(double, double) [base object constructor]: ; 14 instructions
    push rbp
    mov rbp, rsp
    mov QWORD PTR [rbp-8], rdi ; spill the object pointer
    movsd QWORD PTR [rbp-16], xmm0 ; spill the first double argument
    movsd QWORD PTR [rbp-24], xmm1 ; spill the second double argument
    mov rax, QWORD PTR [rbp-8]
    movsd xmm0, QWORD PTR [rbp-16]
    movsd QWORD PTR [rax], xmm0 ; object+0 = first argument
    mov rax, QWORD PTR [rbp-8]
    movsd xmm0, QWORD PTR [rbp-24]
    movsd QWORD PTR [rax+8], xmm0 ; object+8 = second argument
    nop
    pop rbp
    ret
    ; RNG::num(double, double) -- unoptimized listing.
    ; In: rdi = RNG object pointer, xmm0 = first double (a), xmm1 = second double (b).
    ; Calls std::uniform_real_distribution<double>::operator() with rdi+5000 as the
    ; first pointer argument and the RNG object pointer as the second.
    ; Out: xmm0 = (b - a) * result + a.
    RNG::num(double, double): ; 19 instructions
    push rbp
    mov rbp, rsp
    sub rsp, 48
    mov QWORD PTR [rbp-24], rdi ; save the RNG object pointer
    movsd QWORD PTR [rbp-32], xmm0 ; save a
    movsd QWORD PTR [rbp-40], xmm1 ; save b
    mov rax, QWORD PTR [rbp-24]
    lea rdx, [rax+5000] ; address 5000 bytes into the RNG object
    mov rax, QWORD PTR [rbp-24]
    mov rsi, rax ; second argument = RNG object pointer
    mov rdi, rdx ; first argument = RNG object pointer + 5000
    call double std::uniform_real_distribution<double>::operator()<std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul> >(std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>&)
    movq rax, xmm0
    mov QWORD PTR [rbp-8], rax ; [rbp-8] = value returned by the distribution call
    movsd xmm0, QWORD PTR [rbp-40]
    subsd xmm0, QWORD PTR [rbp-32] ; b - a
    mulsd xmm0, QWORD PTR [rbp-8] ; (b - a) * result
    addsd xmm0, QWORD PTR [rbp-32] ; (b - a) * result + a
    leave
    ret
    ; RNG::analytical_in_unit_disk() -- unoptimized listing.
    ; In: rdi = RNG object pointer (saved at [rbp-56]).
    ; Calls RNG::num(.LC2, .LC1) and takes sqrt of the result (kept at [rbp-8]),
    ; calls RNG::num(.LC2, .LC3) (kept at [rbp-16]), then constructs a Vec2 at
    ; [rbp-48] from cos([rbp-16]) * [rbp-8] and sin([rbp-16]) * [rbp-8].
    ; Out: the two 8-byte halves of that Vec2 in xmm0 and xmm1.
    ; The values of .LC1, .LC2 and .LC3 are not shown in this listing.
    RNG::analytical_in_unit_disk(): ; 49 instructions
    push rbp
    mov rbp, rsp
    sub rsp, 64
    mov QWORD PTR [rbp-56], rdi ; save the RNG object pointer
    movsd xmm0, QWORD PTR .LC1[rip]
    mov rax, QWORD PTR [rbp-56]
    movapd xmm1, xmm0 ; second double argument = .LC1
    mov rdx, QWORD PTR .LC2[rip]
    movq xmm0, rdx ; first double argument = .LC2
    mov rdi, rax ; object pointer argument
    call RNG::num(double, double) ; 19 instructions
    movq rax, xmm0
    movq xmm0, rax
    call sqrt
    movq rax, xmm0
    mov QWORD PTR [rbp-8], rax ; [rbp-8] = sqrt of the first RNG::num result
    movsd xmm0, QWORD PTR .LC3[rip]
    mov rax, QWORD PTR [rbp-56]
    movapd xmm1, xmm0 ; second double argument = .LC3
    mov rdx, QWORD PTR .LC2[rip]
    movq xmm0, rdx ; first double argument = .LC2
    mov rdi, rax
    call RNG::num(double, double) ; 19 instructions
    movq rax, xmm0
    mov QWORD PTR [rbp-16], rax ; [rbp-16] = second RNG::num result
    mov rax, QWORD PTR [rbp-16]
    movq xmm0, rax
    call cos
    movsd xmm1, QWORD PTR [rbp-8]
    mulsd xmm0, xmm1
    movsd QWORD PTR [rbp-24], xmm0 ; [rbp-24] = cos([rbp-16]) * [rbp-8]
    mov rax, QWORD PTR [rbp-16]
    movq xmm0, rax
    call sin
    movsd xmm1, QWORD PTR [rbp-8]
    mulsd xmm0, xmm1
    movsd QWORD PTR [rbp-32], xmm0 ; [rbp-32] = sin([rbp-16]) * [rbp-8]
    movsd xmm0, QWORD PTR [rbp-32]
    mov rdx, QWORD PTR [rbp-24]
    lea rax, [rbp-48] ; address of the Vec2 being constructed
    movapd xmm1, xmm0 ; second constructor argument = sin term
    movq xmm0, rdx ; first constructor argument = cos term
    mov rdi, rax
    call Vec2::Vec2(double, double) [complete object constructor] ; 14 instructions
    mov rax, QWORD PTR [rbp-48]
    mov rdx, QWORD PTR [rbp-40]
    movq xmm0, rax ; return the first 8 bytes of the Vec2 in xmm0
    movq xmm1, rdx ; return the second 8 bytes of the Vec2 in xmm1
    leave
    ret

    --------

    The above method:

    (49 - 3) + 19 + 19 + 14: 98 instructions (albeit 6 calls)
    106 changes: 106 additions & 0 deletions rejection_in_unit_disk_gcc_14.2_O0.asm
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,106 @@
    ; Vec2::Vec2(double, double) -- unoptimized listing.
    ; In: rdi = pointer to the object being constructed, xmm0 = first double,
    ; xmm1 = second double.
    ; Stores the first double at [rdi] and the second at [rdi+8].
    Vec2::Vec2(double, double) [base object constructor]: ; 14 instructions
    push rbp
    mov rbp, rsp
    mov QWORD PTR [rbp-8], rdi ; spill the object pointer
    movsd QWORD PTR [rbp-16], xmm0 ; spill the first double argument
    movsd QWORD PTR [rbp-24], xmm1 ; spill the second double argument
    mov rax, QWORD PTR [rbp-8]
    movsd xmm0, QWORD PTR [rbp-16]
    movsd QWORD PTR [rax], xmm0 ; object+0 = first argument
    mov rax, QWORD PTR [rbp-8]
    movsd xmm0, QWORD PTR [rbp-24]
    movsd QWORD PTR [rax+8], xmm0 ; object+8 = second argument
    nop
    pop rbp
    ret
    ; Vec2::length_squared() const -- unoptimized listing.
    ; In: rdi = pointer to the Vec2 object.
    ; Out: xmm0 = [rdi]*[rdi] + [rdi+8]*[rdi+8] (both fields read as doubles).
    Vec2::length_squared() const: ; 16 instructions
    push rbp
    mov rbp, rsp
    mov QWORD PTR [rbp-8], rdi ; spill the object pointer
    mov rax, QWORD PTR [rbp-8]
    movsd xmm1, QWORD PTR [rax]
    mov rax, QWORD PTR [rbp-8]
    movsd xmm0, QWORD PTR [rax]
    mulsd xmm1, xmm0 ; xmm1 = square of the double at object+0
    mov rax, QWORD PTR [rbp-8]
    movsd xmm2, QWORD PTR [rax+8]
    mov rax, QWORD PTR [rbp-8]
    movsd xmm0, QWORD PTR [rax+8]
    mulsd xmm0, xmm2 ; xmm0 = square of the double at object+8
    addsd xmm0, xmm1 ; xmm0 = sum of the two squares (return value)
    pop rbp
    ret
    ; RNG::num(double, double) -- unoptimized listing.
    ; In: rdi = RNG object pointer, xmm0 = first double (a), xmm1 = second double (b).
    ; Calls std::uniform_real_distribution<double>::operator() with rdi+5000 as the
    ; first pointer argument and the RNG object pointer as the second.
    ; Out: xmm0 = (b - a) * result + a.
    RNG::num(double, double): ; 19 instructions
    push rbp
    mov rbp, rsp
    sub rsp, 48
    mov QWORD PTR [rbp-24], rdi ; save the RNG object pointer
    movsd QWORD PTR [rbp-32], xmm0 ; save a
    movsd QWORD PTR [rbp-40], xmm1 ; save b
    mov rax, QWORD PTR [rbp-24]
    lea rdx, [rax+5000] ; address 5000 bytes into the RNG object
    mov rax, QWORD PTR [rbp-24]
    mov rsi, rax ; second argument = RNG object pointer
    mov rdi, rdx ; first argument = RNG object pointer + 5000
    call double std::uniform_real_distribution<double>::operator()<std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul> >(std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>&)
    movq rax, xmm0
    mov QWORD PTR [rbp-8], rax ; [rbp-8] = value returned by the distribution call
    movsd xmm0, QWORD PTR [rbp-40]
    subsd xmm0, QWORD PTR [rbp-32] ; b - a
    mulsd xmm0, QWORD PTR [rbp-8] ; (b - a) * result
    addsd xmm0, QWORD PTR [rbp-32] ; (b - a) * result + a
    leave
    ret
    ; RNG::rejection_in_unit_disk() -- unoptimized listing.
    ; In: rdi = RNG object pointer (saved at [rbp-40]).
    ; Loop at .L12: calls RNG::num(.LC3, .LC1) twice, constructs a Vec2 at [rbp-32]
    ; from the two results, and calls Vec2::length_squared() on it. The loop repeats
    ; until .LC1 > length_squared.
    ; Out: the two 8-byte halves of the accepted Vec2 in xmm0 and xmm1.
    ; The values of .LC1 and .LC3 are not shown in this listing.
    RNG::rejection_in_unit_disk(): ; 41 instructions
    push rbp
    mov rbp, rsp
    push rbx ; rbx is used below to hold the first RNG::num result across a call
    sub rsp, 40
    mov QWORD PTR [rbp-40], rdi ; save the RNG object pointer
    .L12:
    movsd xmm0, QWORD PTR .LC1[rip]
    mov rdx, QWORD PTR .LC3[rip]
    mov rax, QWORD PTR [rbp-40]
    movapd xmm1, xmm0 ; second double argument = .LC1
    movq xmm0, rdx ; first double argument = .LC3
    mov rdi, rax
    call RNG::num(double, double) ; 19 instructions
    movq rbx, xmm0 ; rbx = first RNG::num result
    movsd xmm0, QWORD PTR .LC1[rip]
    mov rdx, QWORD PTR .LC3[rip]
    mov rax, QWORD PTR [rbp-40]
    movapd xmm1, xmm0 ; second double argument = .LC1
    movq xmm0, rdx ; first double argument = .LC3
    mov rdi, rax
    call RNG::num(double, double) ; 19 instructions
    movq rax, xmm0 ; rax = second RNG::num result
    lea rdx, [rbp-32] ; address of the Vec2 being constructed
    movq xmm1, rbx ; second constructor argument = first RNG::num result
    movq xmm0, rax ; first constructor argument = second RNG::num result
    mov rdi, rdx
    call Vec2::Vec2(double, double) [complete object constructor] ; 14 instructions
    lea rax, [rbp-32]
    mov rdi, rax
    call Vec2::length_squared() const ; 16 instructions
    movsd xmm1, QWORD PTR .LC1[rip]
    comisd xmm1, xmm0 ; compare .LC1 against the returned length_squared
    seta al ; al = 1 only when .LC1 > length_squared
    test al, al
    je .L12 ; Possible jump to repeat the above (29 instructions)
    mov rax, QWORD PTR [rbp-32]
    mov rdx, QWORD PTR [rbp-24]
    movq xmm0, rax ; return the first 8 bytes of the Vec2 in xmm0
    movq xmm1, rdx ; return the second 8 bytes of the Vec2 in xmm1
    mov rbx, QWORD PTR [rbp-8] ; restore the rbx value pushed in the prologue
    leave
    ret

    --------

    This needs to traverse quite a few instructions, perform calls (and can loop infinitely). Let's say we had success on our first go:
    41 + 19 + 19 + 14 + 16: 109 instructions (w/ 4 calls)
    If we had one failure but then a success, so the loop is run twice:
    5 + 2 * (29 + 19 + 19 + 14 + 16) + 7: 206 instructions (w/ 8 calls)
    I may be a little off on the math