Skip to content

Instantly share code, notes, and snippets.

@borgar
Created June 24, 2010 12:33
Show Gist options
  • Save borgar/451393 to your computer and use it in GitHub Desktop.
Save borgar/451393 to your computer and use it in GitHub Desktop.

Revisions

  1. borgar revised this gist Jun 24, 2010. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion Tiny JavaScript tokenizer.js
    Original file line number Diff line number Diff line change
    @@ -9,7 +9,7 @@
    *
    */
    function tokenize ( s, parsers, deftok ) {
    var m, r, l, cnt, t, tokens = [];
    var m, r, l, t, tokens = [];
    while ( s ) {
    t = null;
    m = s.length;
  2. borgar renamed this gist Jun 24, 2010. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  3. borgar created this gist Jun 24, 2010.
    44 changes: 44 additions & 0 deletions Tiny JavaScript tokenizer
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,44 @@
    /**
     * Tiny tokenizer
     *
     * Splits a subject string into a flat array of token objects using a set of
     * named regular expressions. At every step each parser is run against the
     * remaining text and the match that starts closest to the current position
     * wins; on a tie the parser enumerated first wins. Any text skipped over to
     * reach that match is emitted as a token of the default type.
     *
     * @param {string} s - Subject string to tokenize.
     * @param {Object.<string, RegExp>} parsers - Map of token type name to the
     *   regular expression that recognizes it. Only the object's own enumerable
     *   properties are used. Each expression's `lastIndex` is reset to 0 before
     *   use, so expressions with the `g` flag are safe to pass.
     * @param {string} [deftok='unknown'] - Type given to text no parser matched.
     * @returns {Array.<Object>} Tokens in source order. Matched tokens have the
     *   shape `{ token, type, matches }`, where `matches` holds the capture
     *   groups of the winning expression. Unmatched text has the shape
     *   `{ token, type }` with no `matches` property.
     *
     * @example
     * tokenize('this is text.', { word:/\w+/, whitespace:/\s+/, punctuation:/[^\w\s]/ }, 'invalid');
     * // => [ { token: 'this', type: 'word', matches: [] },
     * //      { token: ' ', type: 'whitespace', matches: [] },
     * //      { token: 'is', type: 'word', matches: [] }, ... ]
     */
    function tokenize ( s, parsers, deftok ) {
      var m, r, t, re, tokens = [];
      while ( s ) {
        t = null;
        // m is the offset of the best match so far; s.length means "no match"
        m = s.length;
        for ( var key in parsers ) {
          // ignore enumerable properties inherited through the prototype chain
          if ( !Object.prototype.hasOwnProperty.call( parsers, key ) ) {
            continue;
          }
          re = parsers[ key ];
          // global/sticky expressions resume from lastIndex, which would still
          // point into the previous (longer) subject string - always restart
          re.lastIndex = 0;
          r = re.exec( s );
          // try to choose the best match if there are several
          // where "best" is the closest to the current starting point.
          // Zero-length matches are ignored: they consume no input and
          // would keep the loop from ever terminating.
          if ( r && r[ 0 ].length && ( r.index < m ) ) {
            t = {
              token: r[ 0 ],
              type: key,
              matches: r.slice( 1 )
            };
            m = r.index;
          }
        }
        if ( m ) {
          // there is text between last token and currently
          // matched token - push that out as default or "unknown"
          tokens.push({
            token : s.slice( 0, m ),
            type : deftok || 'unknown'
          });
        }
        if ( t ) {
          // push current token onto sequence
          tokens.push( t );
        }
        // drop everything consumed so far; when nothing matched, m equals
        // s.length so the remainder becomes '' and the loop ends
        s = s.slice( m + ( t ? t.token.length : 0 ) );
      }
      return tokens;
    }