Skip to content

Instantly share code, notes, and snippets.

@non
Last active February 18, 2025 04:43
Show Gist options
  • Select an option

  • Save non/4d9060b7a0b9aba0bddadfc6ba3e3442 to your computer and use it in GitHub Desktop.

Select an option

Save non/4d9060b7a0b9aba0bddadfc6ba3e3442 to your computer and use it in GitHub Desktop.

Revisions

  1. non revised this gist Feb 3, 2022. 2 changed files with 102 additions and 8 deletions.
    47 changes: 47 additions & 0 deletions grep.tal
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,47 @@
    ( grep.tal )
    ( )
    ( by d_m )

    ( print a character to STDOUT )
    %emit { #18 DEO }

    ( the first argument to grep is the regex )
    ( arguments are passed on STDIN, so we just )
    ( assume the first "line" is the argument )
    ( and the rest are lines to be grepped )
    |0100
    ;r-read-stdin #10 DEO2 BRK

    @regex 0000 ( compiled regex address (if any) )
    @buffer $1000 ( buffer to read user input )
    @ptr :buffer ( next byte to write in buffer )

    @println ( s* -> )
    &loop LDAk #00 EQU ,&eof JCN ( did we reach \0 ? )
    LDAk emit INC2 ,&loop JMP ( no so emit a char and repeat )
    &eof #0a emit POP2 JMP2r ( yes so emit \n and return )

    @r-read-stdin ( -> )
    #12 DEI #0a EQU ,&execute JCN ( did we read \n ? )
    #12 DEI ;ptr LDA2 STA ( no, so save in buffer )
    ;ptr LDA2k INC2 SWP2 STA2 ( ptr++ )
    BRK ( return )

    &execute ( we saw a newline, so do something )
    #00 ;ptr LDA2 STA ( null terminate str )
    ;buffer ;ptr STA2 ( reset ptr )
    ;regex LDA2 #0000 EQU2 ( is regex initialized? )
    ,&need-regex JCN ( jump if unset )

    ( regex is set )
    ;buffer ;regex LDA2 ;match JSR2 ( match regex )
    #00 EQU ,&no-match JCN ( did we match? )
    ;buffer ;println JSR2 ( print any match )
    &no-match BRK ( return )

    ( regex is unset )
    &need-regex ;buffer ;compile JSR2 ( compile regex )
    ;regex STA2 BRK ( save regex and return )

    ( include the actual regex machinery )
    ~regex.tal
    63 changes: 55 additions & 8 deletions regex.tal
    Original file line number Diff line number Diff line change
    @@ -52,6 +52,8 @@
    @stack-is-empty "stack 20 "is 20 "empty 00
    @arena-is-full "arena 20 "is 20 "full 00
    @star-invariant "star 20 "invariant 20 "failed 00
    @plus-invariant "plus 20 "invariant 20 "failed 00
    @qmark-invariant "question 20 "mark 20 "invariant 20 "failed 00

    ( REGEX MATCHING )

    @@ -80,7 +82,7 @@
    LDAk #03 EQU ;do-literal JCN2
    LDAk #04 EQU ;do-or JCN2
    LDAk #05 EQU ;do-or JCN2 ( same code as the or case )
    LDAk ;emit-byte JSR2 ;unknown-node-type ;error! JSR2
    ;unknown-node-type ;error! JSR2

    ( used when we hit a dead-end during matching. )
    ( )
    @@ -152,15 +154,21 @@
    @peek-to-star ( -> is-star^ )
    ;pos LDA2 LDA LIT '* EQU JMP2r

    ( is pos currently pointing to a plus? )
    @peek-to-plus ( -> is-plus^ )
    ;pos LDA2 LDA LIT '+ EQU JMP2r

    ( is pos currently pointing to a qmark? )
    @peek-to-qmark ( -> is-qmark^ )
    ;pos LDA2 LDA LIT '? EQU JMP2r

    ( just increment pos )
    @skip
    ;pos LDA2 INC2 ;pos STA2 JMP2r

    ( TODO: )
    ( 1. character groups: [] and [^] )
    ( 2. one-or-more: + )
    ( 3. zero-or-one: ? )
    ( 4. symbolic escapes, e.g. \n )
    ( 2. symbolic escapes, e.g. \n )

    ( STRETCH GOALS: )
    ( a. ^ and $ )
    @@ -205,6 +213,8 @@
    DUP LIT ') EQU ;c-rpar JCN2
    DUP LIT '\ EQU ;c-esc JCN2
    DUP LIT '* EQU ;c-star JCN2
    DUP LIT '+ EQU ;c-plus JCN2
    DUP LIT '? EQU ;c-qmark JCN2
    ;c-char JMP2

    ( either finalize the given r0/r1 or else wrap it in )
    @@ -213,9 +223,13 @@
    ( we use this look-ahead approach rather than compiling )
    ( star nodes directly since the implementation is simpler. )
    @c-peek-and-finalize ( r0* r1* -> r2* )
    ;peek-to-star JSR2 ( r0 r1 next-is-star? )
    ,&next-is-star JCN ,&finally JMP ( r0 r1 )
    &next-is-star ;skip JSR2 POP2 ;alloc-star JSR2 DUP2 ( star )
    ;peek-to-star JSR2 ( r0 r1 next-is-star? ) ,&next-is-star JCN
    ;peek-to-plus JSR2 ( r0 r1 next-is-plus? ) ,&next-is-plus JCN
    ;peek-to-qmark JSR2 ( r0 r1 next-is-qmark? ) ,&next-is-qmark JCN
    ,&finally JMP ( r0 r1 )
    &next-is-star ;skip JSR2 POP2 ;alloc-star JSR2 DUP2 ,&finally JMP
    &next-is-plus ;skip JSR2 POP2 ;alloc-plus JSR2 DUP2 ,&finally JMP
    &next-is-qmark ;skip JSR2 POP2 ;alloc-qmark JSR2 DUP2 ,&finally JMP
    &finally ;push-next JSR2 ;compile-region-loop JMP2

    ( called when we reach EOF of the input string )
    @@ -302,6 +316,20 @@
    POP
    ;star-invariant ;error! JSR2

    ( called if we parse a "+" )
    ( )
    ( actually calling this means the code broke an invariant somewhere. )
    @c-plus ( c^ -> regex* )
    POP
    ;plus-invariant ;error! JSR2

    ( called if we parse a "?" )
    ( )
    ( actually calling this means the code broke an invariant somewhere. )
    @c-qmark ( c^ -> regex* )
    POP
    ;qmark-invariant ;error! JSR2

    ( ALLOCATING REGEX NDOES )

    @alloc3 ( mode^ -> r* )
    @@ -341,6 +369,25 @@
    ;set-next JSR2 ( [r] )
    STH2r JMP2r

    @alloc-plus ( expr* -> r* )
    #05 ;alloc JSR2 STH2 ( expr [r] )
    #05 STH2kr STA ( expr [r] )
    DUP2 STH2kr INC2 STA2 ( expr [r] )
    #0000 STH2kr #0003 ADD2 STA2 ( expr [r] )
    STH2r SWP2 STH2k ( r expr [expr] )
    ;set-next JSR2 ( [expr] )
    STH2r JMP2r

    @alloc-qmark ( expr* -> r* )
    ;alloc-empty JSR2 STH2k ( expr e [e] )
    OVR2 ;set-next JSR2 ( expr [e] )
    #05 ;alloc JSR2 STH2 ( expr [r e] )
    #04 STH2kr STA ( expr [r e] )
    STH2kr INC2 STA2 ( [r e] )
    SWP2r STH2r STH2kr ( e r [r] )
    #0003 ADD2 STA2 ( [r] )
    STH2r JMP2r

    ( if r is 0000, allocate an empty node )
    @alloc-if-null ( r* -> r2* )
    ORAk ,&return JCN POP2 ;alloc-empty JSR2 &return JMP2r
    @@ -400,7 +447,7 @@
    &!2 LDAk #03 NEQ ,&!3 JCN #0002 ADD2 ;set-next-addr JSR2 JMP2r
    &!3 LDAk #04 NEQ ,&!4 JCN INC2 ;set-next-addr JSR2 JMP2r
    &!4 LDAk #05 NEQ ,&!5 JCN #0003 ADD2 ;set-next-addr JSR2 JMP2r
    &!5 LDAk ;emit-byte JSR2 ;unknown-node-type ;error! JSR2
    &!5 ;unknown-node-type ;error! JSR2

    @set-next-or-addr ( target* addr* -> )
    LDA2k #0000 EQU2 ( target addr addr=0? ) ,&is-zero JCN
  2. non revised this gist Feb 2, 2022. 2 changed files with 142 additions and 9 deletions.
    43 changes: 34 additions & 9 deletions regex.tal
    Original file line number Diff line number Diff line change
    @@ -80,7 +80,7 @@
    LDAk #03 EQU ;do-literal JCN2
    LDAk #04 EQU ;do-or JCN2
    LDAk #05 EQU ;do-or JCN2 ( same code as the or case )
    ;unknown-node-type ;error! JSR2
    LDAk ;emit-byte JSR2 ;unknown-node-type ;error! JSR2

    ( used when we hit a dead-end during matching. )
    ( )
    @@ -341,6 +341,10 @@
    ;set-next JSR2 ( [r] )
    STH2r JMP2r

    ( if r is 0000, allocate an empty node )
    @alloc-if-null ( r* -> r2* )
    ORAk ,&return JCN POP2 ;alloc-empty JSR2 &return JMP2r

    ( unroll one region of the parsing stack, returning )
    ( a single node consisting of an alternation of )
    ( all elements on the stack. )
    @@ -350,7 +354,7 @@
    @unroll-stack ( -> start* end* )
    ;pop4 JSR2 STH2 ( r )
    #00 STH ( count items in stack frame )
    DUP2 #0000 NEQ2 ,&loop JCN ;alloc-empty JSR2
    ;alloc-if-null JSR2 ( replace 0000 with empty )
    &loop ( r* )
    ;pop4 JSR2 POP2 ( r x )
    DUP2 #ffff EQU2 ( r x x-is-end? ) ,&done JCN
    @@ -364,7 +368,7 @@
    &is-or
    POP2r
    ;alloc-empty JSR2 OVR2 OVR2 SWP2 ( r empty empty r )
    ;set-next JSR2
    ;set-next-or JSR2
    JMP2r

    ( add r to the top of the stock. )
    @@ -393,13 +397,25 @@
    @set-next ( target* regex* -> )
    LDAk #01 NEQ ,&!1 JCN INC2 ;set-next-addr JSR2 JMP2r
    &!1 LDAk #02 NEQ ,&!2 JCN INC2 ;set-next-addr JSR2 JMP2r
    &!2 LDAk #03 NEQ ,&!3 JCN #0002 ADD2 ;set-next-addr JSR2 JMP2r
    &!3 LDAk #04 NEQ ,&!4 JCN
    ( todo: this is probably broken )
    OVR2 OVR2 INC2 ;set-next-addr JSR2
    #0003 ADD2 ;set-next-addr JSR2 JMP2r
    &!2 LDAk #03 NEQ ,&!3 JCN #0002 ADD2 ;set-next-addr JSR2 JMP2r
    &!3 LDAk #04 NEQ ,&!4 JCN INC2 ;set-next-addr JSR2 JMP2r
    &!4 LDAk #05 NEQ ,&!5 JCN #0003 ADD2 ;set-next-addr JSR2 JMP2r
    &!5 ;unknown-node-type ;error! JSR2
    &!5 LDAk ;emit-byte JSR2 ;unknown-node-type ;error! JSR2

    @set-next-or-addr ( target* addr* -> )
    LDA2k #0000 EQU2 ( target addr addr=0? ) ,&is-zero JCN
    LDA2 ;set-next-or JSR2 JMP2r
    &is-zero STA2 JMP2r

    ( this is used when first building or-nodes )
    ( structure will always be: )
    ( [x1, [x2, [x3, ..., [xm, xn]]]] )
    ( so we recurse on the right side but not the left. )
    @set-next-or ( target* regex* -> )
    LDAk #04 NEQ ,&!4 JCN
    OVR2 OVR2 INC2 ;set-next-addr JSR2
    #0003 ADD2 ;set-next-or-addr JSR2 JMP2r
    &!4 ;set-next JMP2

    ( STACK OPERATIONS )
    ( )
    @@ -431,6 +447,15 @@
    STH2r STH2r ( restore str and regex )
    JMP2r

    ( -> size^ )
    @frame-size
    #00 STH ;stack-pos LDA2
    &loop
    #0004 SUB2 LDA2k #ffff EQU2 ,&done JCN
    INCr ,&loop JMP
    &done
    STHr JMP2r

    ( reset stack pointers )
    @reset-stack ( -> )
    ;stack-bot ;stack-pos STA2 JMP2r ( pos <- 0 )
    108 changes: 108 additions & 0 deletions repl-regex.tal
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,108 @@
    ( repl-regex.tal )

    %dbg { #ff #0e DEO }
    %sp { #20 #18 DEO }
    %nl { #0a #18 DEO }
    %exit { #01 #0f DEO BRK }

    ( read in regular expressions )
    ( and emit internal structures parsed )
    |0100
    ;r-prompt ;println JSR2
    ;r-read-stdin #10 DEO2 BRK

    ( we use two different prompts depending on what mode we're in )
    @r-prompt "enter 20 "regex: 20 00
    @s-prompt "string 20 "to 20 "match: 20 00


    @regex $2 ( compiled regex address (if any) )
    @buffer $1000 ( buffer to read user input )
    @ptr :buffer ( next byte to write in buffer )

    @println ( s* -> )
    &loop LDAk #00 EQU ,&eof JCN
    LDAk #18 DEO INC2 ,&loop JMP
    &eof POP2 JMP2r

    @r-read-stdin ( -> )
    #12 DEI #0a EQU ,&execute JCN
    #12 DEI ;ptr LDA2 STA
    ;ptr LDA2k INC2 SWP2 STA2
    BRK
    &execute
    #00 ;ptr LDA2 STA
    ;buffer ;ptr STA2
    ;buffer ;compile JSR2 dbg nl
    DUP2 ;regex STA2
    ;emit-stack JSR2 nl
    ;emit-arena JSR2 nl
    ;reset-arena JSR2
    POP2
    ;s-prompt ;println JSR2
    ;s-read-stdin #10 DEO2 BRK
    BRK

    @s-read-stdin ( -> )
    #12 DEI #0a EQU ,&execute JCN
    #12 DEI ;ptr LDA2 STA
    ;ptr LDA2k INC2 SWP2 STA2
    BRK
    &execute
    #00 ;ptr LDA2 STA ( null terminate string )
    ;ptr LDA2 ;buffer EQU2 STH ( stash is-empty? )
    ;buffer ;ptr STA2 ( reset ptr )
    ;buffer ;regex LDA2 ;match JSR2 ( match regex )
    ;emit-byte JSR2 nl ( print result )
    STHr ,&was-empty JCN
    ;s-prompt ;println JSR2
    BRK
    &was-empty
    ;r-prompt ;println JSR2
    ;r-read-stdin #10 DEO2 BRK
    BRK

    ~regex.tal

    @emit-short ( short* -- )
    SWP ;emit-byte JSR2 ;emit-byte JSR2 JMP2r

    @emit-byte ( byte^ -- )
    DUP #04 SFT ,&hex JSR #0f AND ,&hex JMP
    &hex #30 ADD DUP #39 GTH #27 MUL ADD emit
    JMP2r

    ( print stack size, followed by contents )
    @emit-stack ( -> )
    space LIT 'n emit LIT '= emit ;stack-pos LDA2 ;stack-bot SUB2 #0004 DIV2 ;emit-short JSR2 LIT ': emit
    ;stack-bot
    &loop
    DUP2 ;stack-pos LDA2 LTH2 ,&ok JCN
    POP2 newline JMP2r
    &ok
    space LDA2k ;emit-short JSR2
    #0002 ADD2 ,&loop JMP

    ( emit n bytes from the given address )
    @emit-n ( addr* count^ -> addr2* )
    DUP #00 GTH ( addr count count>0? ) ,&ok JCN ( addr count ) POP newline JMP2r
    &ok
    STH ( addr [count] ) space LDAk ;emit-byte JSR2 INC2 ( addr+1 [count] )
    STHr #01 SUB ( addr+1 count-1 )
    ;emit-n JMP2

    ( emit the arena, with one line per node )
    ( parses node type, since node size is dynamic (3-5). )
    @emit-arena ( -> )
    ;arena-bot
    &loop
    DUP2 ;arena-pos LDA2 LTH2 ,&ok JCN POP2 JMP2r
    &ok
    DUP2 ;emit-short JSR2
    LIT ': emit space
    LDAk #01 NEQ ,&!1 JCN #03 ;emit-n JSR2 ,&loop JMP
    &!1 LDAk #02 NEQ ,&!2 JCN #03 ;emit-n JSR2 ,&loop JMP
    &!2 LDAk #03 NEQ ,&!3 JCN #04 ;emit-n JSR2 ,&loop JMP
    &!3 LDAk #04 NEQ ,&!4 JCN #05 ;emit-n JSR2 ,&loop JMP
    &!4 LDAk #05 NEQ ,&!5 JCN #05 ;emit-n JSR2 ,&loop JMP
    &!5 ;unknown-node-type ;error! JSR2
  3. non revised this gist Jan 31, 2022. 1 changed file with 0 additions and 1 deletion.
    1 change: 0 additions & 1 deletion regex.tal
    Original file line number Diff line number Diff line change
    @@ -489,6 +489,5 @@
    STH2r JMP2r ( pos )
    &error POP2 POP2r ;arena-is-full ;error! JSR2

    |1ffe
    @arena-pos :arena-bot ( the next position to allocate )
    @arena-bot $400 @arena-top ( holds up to 1024 bytes )
  4. non revised this gist Jan 31, 2022. 1 changed file with 98 additions and 0 deletions.
    98 changes: 98 additions & 0 deletions test-regex.tal
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,98 @@
    %dbg { #ff #0e DEO }
    %sp { #20 #18 DEO }
    %nl { #0a #18 DEO }
    %exit { #01 #0f DEO BRK }

    |0100
    ;expr1 ;compile JSR2 dbg nl
    ;emit-stack JSR2 nl
    ;emit-arena JSR2 nl

    ;test1 OVR2k ;match JSR2 ;emit-byte JSR2 sp
    ;test2 OVR2k ;match JSR2 ;emit-byte JSR2 sp
    ;test3 OVR2k ;match JSR2 ;emit-byte JSR2 sp
    ;test4 OVR2k ;match JSR2 ;emit-byte JSR2 sp
    ;test5 OVR2k ;match JSR2 ;emit-byte JSR2 sp
    ;test6 OVR2k ;match JSR2 ;emit-byte JSR2 sp
    ;test7 OVR2k ;match JSR2 ;emit-byte JSR2 sp
    ;test8 OVR2k ;match JSR2 ;emit-byte JSR2 nl

    ;test1 ;graph1 ;match JSR2 ;emit-byte JSR2 sp
    ;test2 ;graph1 ;match JSR2 ;emit-byte JSR2 sp
    ;test3 ;graph1 ;match JSR2 ;emit-byte JSR2 sp
    ;test4 ;graph1 ;match JSR2 ;emit-byte JSR2 sp
    ;test5 ;graph1 ;match JSR2 ;emit-byte JSR2 sp
    ;test6 ;graph1 ;match JSR2 ;emit-byte JSR2 sp
    ;test7 ;graph1 ;match JSR2 ;emit-byte JSR2 sp
    ;test8 ;graph1 ;match JSR2 ;emit-byte JSR2 nl

    ;reset-arena JSR2
    exit

    ( corresponds to regex: a(b|c)d* )
    @expr1 "a(b|c)d* 00

    ( corresponds to regex: a(b|c)d* )
    ( accepts "ab" or "ac" followd by any number of d's )
    @graph1
    03 'a :x1
    @x1 04 :x2 :x3
    @x2 03 'b :x4
    @x3 03 'c :x4
    @x4 05 :x5 0000
    @x5 03 'd :x4

    ( test case strings to try matching )
    @test1 "ab 00 ( yes )
    @test2 "acdd 00 ( yes )
    @test3 "add 00 ( no )
    @test4 "abd 00 ( yes )
    @test5 "acddddddddddd 00 ( yes )
    @test6 "bd 00 ( no )
    @test7 "z 00 ( no )
    @test8 00 ( no )

    ~regex.tal

    @emit-short ( short* -- )
    SWP ;emit-byte JSR2 ;emit-byte JSR2 JMP2r

    @emit-byte ( byte^ -- )
    DUP #04 SFT ,&hex JSR #0f AND ,&hex JMP
    &hex #30 ADD DUP #39 GTH #27 MUL ADD emit
    JMP2r

    ( print stack size, followed by contents )
    @emit-stack ( -> )
    space LIT 'n emit LIT '= emit ;stack-pos LDA2 ;stack-bot SUB2 #0004 DIV2 ;emit-short JSR2 LIT ': emit
    ;stack-bot
    &loop
    DUP2 ;stack-pos LDA2 LTH2 ,&ok JCN
    POP2 newline JMP2r
    &ok
    space LDA2k ;emit-short JSR2
    #0002 ADD2 ,&loop JMP

    ( emit n bytes from the given address )
    @emit-n ( addr* count^ -> addr2* )
    DUP #00 GTH ( addr count count>0? ) ,&ok JCN ( addr count ) POP newline JMP2r
    &ok
    STH ( addr [count] ) space LDAk ;emit-byte JSR2 INC2 ( addr+1 [count] )
    STHr #01 SUB ( addr+1 count-1 )
    ;emit-n JMP2

    ( emit the arena, with one line per node )
    ( parses node type, since node size is dynamic (3-5). )
    @emit-arena ( -> )
    ;arena-bot
    &loop
    DUP2 ;arena-pos LDA2 LTH2 ,&ok JCN POP2 JMP2r
    &ok
    DUP2 ;emit-short JSR2
    LIT ': emit space
    LDAk #01 NEQ ,&!1 JCN #03 ;emit-n JSR2 ,&loop JMP
    &!1 LDAk #02 NEQ ,&!2 JCN #03 ;emit-n JSR2 ,&loop JMP
    &!2 LDAk #03 NEQ ,&!3 JCN #04 ;emit-n JSR2 ,&loop JMP
    &!3 LDAk #04 NEQ ,&!4 JCN #05 ;emit-n JSR2 ,&loop JMP
    &!4 LDAk #05 NEQ ,&!5 JCN #05 ;emit-n JSR2 ,&loop JMP
    &!5 ;unknown-node-type ;error! JSR2
  5. non created this gist Jan 31, 2022.
    494 changes: 494 additions & 0 deletions regex.tal
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,494 @@
    ( regex.tal )
    ( )
    ( compiles regex expression strings into regex nodes, then uses )
    ( regex ndoes to match input strings. )
    ( )
    ( this currently only supports matching an entire string, as )
    ( opposed to searching for a matching substring, or extracting )
    ( matching subgroups. )
    ( )
    ( regex node types: )
    ( )
    ( NAME DESCRIPTION STRUCT )
    ( empty matches empty string [ #01 next* ] )
    ( dot matches any one char [ #02 next* ] )
    ( lit matches one specific char (c) [ #03 c^ next* ] )
    ( or matches either left or right [ #04 left* right* ] )
    ( star matches expr zero-or-more times [ #05 r* next* ] )
    ( (NOTE: r.expr.next must be r) )
    ( )
    ( `or` and `star` have the same structure and are handled by the )
    ( same code (;do-or). however, the node types are kept different )
    ( to make it clearer how to parse and assemble the nodes. )
    ( )
    ( concatenation isn't a node, it is implied by the *next addr. )
    ( a next value of #0000 signals the end of the regex. )
    ( )
    ( in these docs str* is an address to a null-terminated string. )
    ( regexes should not include nulls and cannot match them (other )
    ( than the null which signals the end of a string). )

    %null? { #00 EQU }
    %debug { #ff #0e DEO }
    %emit { #18 DEO }
    %space { #20 emit }
    %newline { #0a emit }
    %quit! { #01 #0f DEO BRK }

    ( ERROR HANDLING )

    ( using error! will print the given message before causing )
    ( the interpreter to halt. )
    @error! ( msg* -> )
    LIT '! emit space
    &loop LDAk ,&continue JCN ,&done JMP
    &continue LDAk emit INC2 ,&loop JMP
    &done POP2 newline quit!

    ( error messages )
    @unknown-node-type "unknown 20 "node 20 "type 00
    @mismatched-parens "mismatched 20 "parenthesis 00
    @stack-is-full "stack 20 "is 20 "full 00
    @stack-is-empty "stack 20 "is 20 "empty 00
    @arena-is-full "arena 20 "is 20 "full 00
    @star-invariant "star 20 "invariant 20 "failed 00

    ( REGEX MATCHING )

    ( use stored regex to match against a stored string. )
    ( )
    ( regex* should be the address of a compiled regex )
    ( such as that returned from ;compile. )
    ( )
    ( str* should be a null-terminated string. )
    ( )
    ( returns true if the string, and false otherwise. )
    @match ( str* regex* -> bool^ )
    ;reset-stack JSR2
    ;loop JMP2

    ( loop used during matching )
    ( )
    ( we don't use the return stack here since that )
    ( complicates the back-tracking we need to do. )
    ( ultimately this code will issue a JMP2r to )
    ( return a boolean, which is where the stack )
    ( effects signature comes from. )
    @loop ( s* r* -> bool^ )
    LDAk #01 EQU ;do-empty JCN2
    LDAk #02 EQU ;do-dot JCN2
    LDAk #03 EQU ;do-literal JCN2
    LDAk #04 EQU ;do-or JCN2
    LDAk #05 EQU ;do-or JCN2 ( same code as the or case )
    ;unknown-node-type ;error! JSR2

    ( used when we hit a dead-end during matching. )
    ( )
    ( if stack is non-empty we have a point we can resume from. )
    @goto-backtrack ( -> bool^ )
    ;stack-exist JSR2 ,&has-stack JCN ( do we have stack? )
    #00 JMP2r ( no, return false )
    &has-stack ;pop4 JSR2 ;goto-next JMP2 ( yes, resume from the top )

    ( follow the given address (next*) to continue matching )
    @goto-next ( str* next* -> bool^ )
    DUP2 #0000 GTH2 ,&has-next JCN
    POP2 LDA null? ,&end-of-string JCN
    ;goto-backtrack JMP2
    &end-of-string #01 JMP2r
    &has-next ;loop JMP2

    ( handle the empty node -- just follow the next pointer )
    @do-empty ( str* regex* -> bool^ )
    INC2 LDA2 ( load next )
    ;goto-next JMP2 ( jump to next )

    ( handle dot -- match any one character )
    @do-dot ( str* regex* -> bool^ )
    INC2 LDA2 STH2 ( load and stash next )
    LDAk #00 NEQ ,&non-empty JCN ( is there a char? )
    POP2r POP2 ;goto-backtrack JMP2 ( no, clear stacks and backtrack )
    &non-empty INC2 STH2r ;goto-next JMP2 ( yes, inc s, restore and jump )

    ( handle literal -- match one specific character )
    @do-literal ( str* regex* -> bool^ )
    INC2
    LDAk STH ( store c )
    INC2 LDA2 STH2 ROTr ( store next, move c to top )
    LDAk
    STHr EQU ,&matches JCN ( do we match this char? )
    POP2r POP2 ;goto-backtrack JMP2 ( no, clear stacks and backtrack )
    &matches
    INC2 STH2r ;goto-next JMP2 ( yes, inc s, restore and jump )

    ( handle or -- try the left branch but backtrack to the right if needed )
    ( )
    ( this also handles asteration, since it ends up having the same structure )
    @do-or ( str* regex* -> bool^ )
    INC2 OVR2 OVR2 #0002 ADD2 ( s r+1 s r+3 )
    LDA2 ;push4 JSR2 ( save (s, right) in the stack for possible backtracking )
    LDA2 ;loop JMP2 ( continue on left branch )

    ( REGEX PARSING )

    ( track the position in the input string )
    @pos $2

    ( track how many levels deep we are in parenthesis )
    @parens $2

    ( read and increment pos )
    @read ( -> c^ )
    ;pos LDA2k ( pos s )
    LDAk STHk #00 EQU ( pos s c=0 [c] )
    ,&is-eof JCN ( pos s [c] )
    INC2 ( pos s+1 [c] )
    SWP2 STA2 ,&return JMP ( [c] )
    &is-eof POP2 POP2
    &return STHr ( c )
    JMP2r

    ( is pos currently pointing to a star? )
    @peek-to-star ( -> is-star^ )
    ;pos LDA2 LDA LIT '* EQU JMP2r

    ( just increment pos )
    @skip
    ;pos LDA2 INC2 ;pos STA2 JMP2r

    ( TODO: )
    ( 1. character groups: [] and [^] )
    ( 2. one-or-more: + )
    ( 3. zero-or-one: ? )
    ( 4. symbolic escapes, e.g. \n )

    ( STRETCH GOALS: )
    ( a. ^ and $ )
    ( b. counts: {n} and {m,n} )
    ( c. substring matching, i.e. searching )
    ( d. subgroup extraction )
    ( e. back-references, e.g \1 )

    ( compile an expression string into a regex graph )
    ( )
    ( the regex will be allocated in the arena; if there is not )
    ( sufficient space an error will be thrown. )
    ( )
    ( the stack will also be used during parsing although unlike )
    ( the arena it will be released once compilation ends. )
    @compile ( expr* -> regex* )
    ;pos STA2
    #0000 ;parens STA2
    ;reset-stack JSR2
    ;compile-region JMP2

    ( the basic strategy here is to build a stack of non-or )
    ( expressions to be joined together at the end of the )
    ( region. each stack entry has two regex addresses: )
    ( - the start of the regex )
    ( - the current tail of the regex )
    ( when we concatenate a new node to a regex we update )
    ( the second of these but not the first. )
    ( )
    ( the bottom of the stack for a given region is denoted )
    ( by #ffff #ffff. above that we start with #0000 #0000 )
    ( to signal an empty node. )
    @compile-region ( -> r2* )
    #ffff #ffff ;push4 JSR2 ( stack delimiter )
    #0000 #0000 ;push4 JSR2 ( stack frame start )
    @compile-region-loop
    ;read JSR2
    DUP #00 EQU ;c-done JCN2
    DUP LIT '| EQU ;c-or JCN2
    DUP LIT '. EQU ;c-dot JCN2
    DUP LIT '( EQU ;c-lpar JCN2
    DUP LIT ') EQU ;c-rpar JCN2
    DUP LIT '\ EQU ;c-esc JCN2
    DUP LIT '* EQU ;c-star JCN2
    ;c-char JMP2

    ( either finalize the given r0/r1 or else wrap it in )
    ( a star node if a star is coming up next. )
    ( )
    ( we use this look-ahead approach rather than compiling )
    ( star nodes directly since the implementation is simpler. )
    @c-peek-and-finalize ( r0* r1* -> r2* )
    ;peek-to-star JSR2 ( r0 r1 next-is-star? )
    ,&next-is-star JCN ,&finally JMP ( r0 r1 )
    &next-is-star ;skip JSR2 POP2 ;alloc-star JSR2 DUP2 ( star )
    &finally ;push-next JSR2 ;compile-region-loop JMP2

    ( called when we reach EOF of the input string )
    ( )
    ( as with c-rpar we have to unroll the current level )
    ( of the stack, building any or-nodes that are needed. )
    ( )
    ( this is where we detect unclosed parenthesis. )
    @c-done ( c^ -> r2* )
    POP
    ;parens LDA2 #0000 GTH2 ,&mismatched-parens JCN
    ;unroll-stack JSR2 POP2 JMP2r
    &mismatched-parens ;mismatched-parens ;error! JSR2

    ( called when we read "|" )
    ( )
    ( since we defer building or-nodes until the end of the region )
    ( we just start a new stack frame and continue. )
    @c-or ( c^ -> r2* )
    POP
    #0000 #0000 ;push4 JSR2
    ;compile-region-loop JMP2

    ( called when we read "(" )
    ( )
    ( this causes us to: )
    ( )
    ( 1. increment parens )
    ( 2. start a new region on the stack )
    ( 3. jump to compile-region to start parsing the new region )
    @c-lpar ( c^ -> r2* )
    POP
    ;parens LDA2 INC2 ;parens STA2 ( parens++ )
    ;compile-region JMP2

    ( called when we read ")" )
    ( )
    ( this causes us to: )
    ( )
    ( 1. check for mismatched parens )
    ( 2. decrement parens )
    ( 3. unroll the current region on the stack into one regex node )
    ( 4. finalize that node and append it to the previous region )
    ( 5. continue parsing )
    @c-rpar ( c^ -> r2* )
    POP
    ;parens LDA2 #0000 EQU2 ,&mismatched-parens JCN
    ;parens LDA2 #0001 SUB2 ;parens STA2 ( parens-- )
    ;unroll-stack JSR2
    ;c-peek-and-finalize JMP2
    &mismatched-parens ;mismatched-parens ;error! JSR2

    ( called when we read "." )
    ( )
    ( allocates a dot-node and continues. )
    @c-dot ( c^ -> r2* )
    POP
    ;alloc-dot JSR2 ( dot )
    DUP2 ;c-peek-and-finalize JMP2

    ( TODO: escaping rules not quite right )

    ( called when we read "\" )
    ( )
    ( allocates a literal of the next character. )
    ( )
    ( this doesn't currently handle any special escape sequences. )
    @c-esc ( c^ -> r2* )
    POP
    ;read JSR2
    ;c-char JMP2

    ( called when we read any other character )
    ( )
    ( allocates a literal-node and continues. )
    @c-char ( c^ -> r2* )
    ;alloc-lit JSR2 ( lit )
    DUP2 ;c-peek-and-finalize JMP2

    ( called if we parse a "*" )
    ( )
    ( actually calling this means the code broke an invariant somewhere. )
    @c-star ( c^ -> regex* )
    POP
    ;star-invariant ;error! JSR2

    ( ALLOCATING REGEX NDOES )

    @alloc3 ( mode^ -> r* )
    #0000 ROT ( 00 00 mode^ )
    #03 ;alloc JSR2 ( 00 00 mode^ addr* )
    STH2k STA ( addr <- mode )
    STH2kr INC2 STA2 ( addr+1 <- 0000 )
    STH2r JMP2r ( return addr )

    @alloc-empty ( -> r* )
    #01 ;alloc3 JMP2

    @alloc-dot ( -> r* )
    #02 ;alloc3 JMP2

    @alloc-lit ( c^ -> r* )
    #03 #0000 SWP2 ( 0000 c^ 03 )
    #04 ;alloc JSR2 ( 0000 c^ 03 addr* )
    STH2k STA ( addr <- 03 )
    STH2kr INC2 STA ( addr+1 <- c )
    STH2kr #0002 ADD2 STA2 ( addr+2 <- 0000 )
    STH2r JMP2r ( return addr )

    @alloc-or ( right* left* -> r* )
    #05 ;alloc JSR2 STH2 ( r l [x] )
    #04 STH2kr STA ( r l [x] )
    STH2kr INC2 STA2 ( r [x] )
    STH2kr #0003 ADD2 STA2 ( [x] )
    STH2r JMP2r

    @alloc-star ( expr* -> r* )
    #05 ;alloc JSR2 STH2 ( expr [r] )
    #05 STH2kr STA ( expr [r] )
    DUP2 STH2kr INC2 STA2 ( expr [r] )
    #0000 STH2kr #0003 ADD2 STA2 ( expr [r] )
    STH2kr SWP2 ( r expr [r] )
    ;set-next JSR2 ( [r] )
    STH2r JMP2r

    ( unroll one region of the parsing stack, returning )
    ( a single node consisting of an alternation of )
    ( all elements on the stack. )
    ( )
    ( this unrolls until it hits #ffff #ffff, which it )
    ( also removes from the stack. )
    @unroll-stack ( -> start* end* )
    ;pop4 JSR2 STH2 ( r )
    #00 STH ( count items in stack frame )
    DUP2 #0000 NEQ2 ,&loop JCN ;alloc-empty JSR2
    &loop ( r* )
    ;pop4 JSR2 POP2 ( r x )
    DUP2 #ffff EQU2 ( r x x-is-end? ) ,&done JCN
    INCr ( items++ )
    ;alloc-or JSR2 ( r|x ) ,&loop JMP
    &done
    ( r ffff )
    POP2
    STHr ,&is-or JCN
    STH2r JMP2r
    &is-or
    POP2r
    ;alloc-empty JSR2 OVR2 OVR2 SWP2 ( r empty empty r )
    ;set-next JSR2
    JMP2r

    ( add r to the top of the stock. )
    ( )
    ( in particular, this will write r into tail.next )
    ( before replacing tail with r. )
    @push-next ( r0 r1 -> )
    ;pop4 JSR2 ( r0 r1 x0 x1 )
    DUP2 #0000 EQU2 ( r0 r1 x0 x1 x1=0? ) ,&is-zero JCN
    STH2 ROT2 STH2r ( r1 x0 r0 x1 )
    ;set-next JSR2 SWP2 ( x0 r1 )
    ;push4 JSR2
    JMP2r
    &is-zero POP2 POP2 ;push4 JSR2 JMP2r

    ( load the given address: )
    ( )
    ( 1. if it points to 0000, update it to target )
    ( 2. otherwise, call set-next on it )
    @set-next-addr ( target* addr* -> )
    LDA2k #0000 EQU2 ( target addr addr=0? ) ,&is-zero JCN
    LDA2 ;set-next JSR2 JMP2r
    &is-zero STA2 JMP2r

    ( set regex.next to target )
    @set-next ( target* regex* -> )
    LDAk #01 NEQ ,&!1 JCN INC2 ;set-next-addr JSR2 JMP2r
    &!1 LDAk #02 NEQ ,&!2 JCN INC2 ;set-next-addr JSR2 JMP2r
    &!2 LDAk #03 NEQ ,&!3 JCN #0002 ADD2 ;set-next-addr JSR2 JMP2r
    &!3 LDAk #04 NEQ ,&!4 JCN
    ( todo: this is probably broken )
    OVR2 OVR2 INC2 ;set-next-addr JSR2
    #0003 ADD2 ;set-next-addr JSR2 JMP2r
    &!4 LDAk #05 NEQ ,&!5 JCN #0003 ADD2 ;set-next-addr JSR2 JMP2r
    &!5 ;unknown-node-type ;error! JSR2

    ( STACK OPERATIONS )
    ( )
    ( we always push/pop 4 bytes at a time. the stack has a fixed )
    ( maximum size it can use, defined by ;stack-top. )
    ( )
    ( the stack can be cleared using ;reset-stack, which resets )
    ( the stack pointers but does not zero out any memory. )
    ( )
    ( stack size is 4096 bytes here but is configurable. )
    ( in some cases it could be very small but this will limit )
    ( how many branches can be parsed and executed. )

    ( push 4 bytes onto the stack )
    @push4 ( str* regex* -> )
    ;assert-stack-avail JSR2 ( check for space )
    ;stack-pos LDA2 #0002 ADD2 STA2 ( cell[2:3] <- regex )
    ;stack-pos LDA2 STA2 ( cell[0:1] <- str )
    ;stack-pos LDA2 #0004 ADD2 ;stack-pos STA2 ( pos += 4 )
    JMP2r

    ( pop 4 bytes from the stack )
    @pop4 ( -> str* regex* )
    ;assert-stack-exist JSR2 ( check for space )
    ;stack-pos LDA2 ( load stack-pos )
    #0002 SUB2 LDA2k STH2 ( pop and stash regex )
    #0002 SUB2 LDA2k STH2 ( pop and stash str )
    ;stack-pos STA2 ( save new stack-pos )
    STH2r STH2r ( restore str and regex )
    JMP2r

    ( reset stack pointers )
    @reset-stack ( -> )
    ;stack-bot ;stack-pos STA2 JMP2r ( pos <- 0 )

    ( can more stack be allocated? )
    @stack-avail ( -> bool^ )
    ;stack-pos LDA2 ;stack-top LTH2 JMP2r

    ( is the stack non-empty? )
    @stack-exist ( -> bool^ )
    ;stack-pos LDA2 ;stack-bot GTH2 JMP2r

    ( error if stack is full )
    @assert-stack-avail ( -> )
    ;stack-avail JSR2 ,&ok JCN ;stack-is-full ;error! JSR2 &ok JMP2r

    ( error is stack is empty )
    @assert-stack-exist ( -> )
    ;stack-exist JSR2 ,&ok JCN ;stack-is-empty ;error! JSR2 &ok JMP2r

    ( stack-pos points to the next free stack position (or the top if full). )
    @stack-pos :stack-bot ( the next position to insert at )

    ( stack-bot is the address of the first stack position. )
    ( stack-top is the address of the first byte beyond the stack. )
    @stack-bot $1000 @stack-top ( holds 1024 steps (4096 bytes) )

    ( ARENA OPERATIONS )
    ( )
    ( the arena represents a heap of memory that can easily be )
    ( allocated in small amounts. )
    ( )
    ( the entire arena can be reclaimed using ;reset-arena, but )
    ( unlike systems such as malloc/free, the arena cannot relcaim )
    ( smaller amounts of memory. )
    ( )
    ( the arena is used to allocate regex graph nodes, which are )
    ( dynamically-allocated as the regex string is parsed. once )
    ( a regex is no longer needed the arena may be reclaimed. )
    ( )
    ( arena size is 1024 bytes here but is configurable. )
    ( smaller sizes would likely be fine but will limit the )
    ( overall complexity of regexes to be parsed and executed. )

    ( reclaim all the memory used by the arena )
    @reset-arena ( -> )
    ;arena-bot ;arena-pos STA2 JMP2r

    ( currently caller is responsible for zeroing out memory if needed )
    @alloc ( size^ -> addr* )
    #00 SWP ( size* )
    ;arena-pos LDA2 STH2k ADD2 ( pos+size* [pos] )
    DUP2 ;arena-top GTH2 ( pos+size pos+size>top? [pos] )
    ,&error JCN ( pos+size [pos] )
    ;arena-pos STA2 ( pos += size [pos] )
    STH2r JMP2r ( pos )
    &error POP2 POP2r ;arena-is-full ;error! JSR2

    |1ffe
    @arena-pos :arena-bot ( the next position to allocate )
    @arena-bot $400 @arena-top ( holds up to 1024 bytes )