Skip to content

Instantly share code, notes, and snippets.

@JordanDelcros
Created November 6, 2015 19:36
Show Gist options
  • Select an option

  • Save JordanDelcros/30b4421e47b1eb18ad13 to your computer and use it in GitHub Desktop.

Select an option

Save JordanDelcros/30b4421e47b1eb18ad13 to your computer and use it in GitHub Desktop.

Revisions

  1. Jordan Delcros created this gist Nov 6, 2015.
    198 changes: 198 additions & 0 deletions parseHTML.js
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,198 @@
    /**
     * Turn a string of HTML source code into syntax-highlighted markup.
     *
     * The source is HTML-escaped first, then tags, attributes (name / value),
     * comments, CDATA sections, stray "<" characters and tabs are annotated.
     * Each input line is emitted as its own <pre> element and each annotation
     * becomes a <span class="..."> whose class is one of: tag, bracket,
     * attribute, name, value, comment, cdata, error, tabulation.
     *
     * While the text is being processed, annotations are stored as plain-text
     * markers: "##/name/##" opens and "##\name\##" closes. Every "#" of the
     * input is escaped beforehand, so the input itself can never contain a
     * marker. Markers are converted to <span> tags only at the very end.
     *
     * @param {string} html - HTML source to highlight. null and undefined are
     *     treated as an empty string; other values are converted with String().
     * @returns {string} The concatenated <pre> elements, one per input line.
     */
    var parser = function( html ){

        // A tag: "&lt;", optional "/", a name, any number of attributes, then
        // either the closing "&gt;" or (when none follows) the rest of the input.
        var TAG = /(&lt;(?!\!)\/?(?:\w+)((?:\s*?[\w-]+(?:\s*=\s*)?(?:\"[^\"]*(?:\")?|\'[^\']*(?:\')?)?)*)?(?:\s*\/?\s*&gt;|[\s\S]*$)?)/gi;

        // One step through a tag: optional tag opener, optional attribute
        // (captured), optional bracket. Every part is optional, so it also
        // produces empty matches, which are returned unchanged.
        var ATTRIBUTE = /(?:&lt;\/?\w+)?(\s*[\w-]+(?:=?\"[^\"]*(?:\")?|=\'[^\']*(?:\')?)?)?(?:\/?&gt;|&lt;)?/gi;

        function openMarker( name ){

            return "##/" + name + "/##";

        }

        function closeMarker( name ){

            return "##\\" + name + "\\##";

        }

        // Surround text with the opening and closing marker of one annotation.
        function wrap( name, text ){

            return openMarker(name) + text + closeMarker(name);

        }

        // Remove every marker so a region (comment, CDATA, value) is rendered
        // as one flat span without nested annotations.
        function stripMarkers( text ){

            return text.replace(/##(\/|\\)\w+\1##/g, "");

        }

        // Annotate the attribute captured by ATTRIBUTE inside its surrounding match.
        function markAttribute( global, matchAttribute ){

            if( !matchAttribute ){

                return global;

            }

            var marked;

            if( /^\s*[\w-]+\s*$/.test(matchAttribute) ){

                // Bare attribute name without a value.
                marked = wrap("name", matchAttribute);

            }
            else if( /^\s*[\w-]+=/.test(matchAttribute) ){

                // name=value: split at the first "=" into [name, "=", value, ""].
                var assigned = matchAttribute.split(/(=)([\s\S]*)/);

                assigned[0] = wrap("name", assigned[0]);
                assigned[2] = wrap("value", assigned[2]);

                marked = assigned.join("");

            }
            else if( /^\s*[\w-]+[\"\']/.test(matchAttribute) ){

                // name"value" (missing "="): split in front of the first quote.
                var quoted = matchAttribute.split(/([\"\'][\s\S]+)/);

                quoted[0] = wrap("name", quoted[0]);
                quoted[1] = wrap("value", quoted[1] || "");

                marked = quoted.join("");

            }
            else {

                return global;

            }

            var attribute = wrap("attribute", marked);

            // A function replacer is required here: with a string replacement,
            // "$$", "$&", "$1"... found in attribute values would be interpreted
            // as replacement patterns and corrupt the output.
            return global.replace(matchAttribute, function(){

                return attribute;

            });

        }

        // Annotate one whole tag: its attributes, its brackets, then the tag itself.
        function markTag( matchTag ){

            var tag = matchTag.replace(ATTRIBUTE, markAttribute);

            return wrap("tag", tag.replace(/(^&lt;\/?|\/?&gt;$)/gi, '##/bracket/##$1##\\bracket\\##'));

        }

        // Make every line self-contained. Each line ends up in its own <pre>,
        // so an annotation spanning several lines is closed at the end of each
        // line and re-opened at the start of the next one. A closing marker
        // whose opener is missing gets an opener prepended to its line.
        function balanceLines( lines ){

            var carried = []; // names still open after the previous line, outermost first
            var balanced = [];

            for( var index = 0; index < lines.length; index++ ){

                var line = lines[index];
                var stack = carried.slice(0);
                var orphans = [];
                var markers = line.match(/##(\/|\\)\w+\1##/g) || [];

                for( var position = 0; position < markers.length; position++ ){

                    // A marker is "##", one slash character, the name, the
                    // same slash again, "##".
                    var name = markers[position].slice(3, -3);

                    if( markers[position].charAt(2) === "/" ){

                        stack.push(name);

                    }
                    else {

                        var opener = stack.lastIndexOf(name);

                        if( opener === -1 ){

                            orphans.push(name);

                        }
                        else {

                            stack.splice(opener, 1);

                        }

                    }

                }

                balanced.push(
                    carried.map(openMarker).join("") +
                    orphans.map(openMarker).join("") +
                    line +
                    stack.slice(0).reverse().map(closeMarker).join("")
                );

                carried = stack;

            }

            return balanced;

        }

        var lines = ( html == null ? "" : String(html) )
            // Escape "&" first so entities already present in the source are
            // displayed literally and cannot be mistaken for the escapes
            // produced by the following steps.
            .replace(/&/g, "&amp;")
            // NOTE(review): the pattern of this step was lost (the original
            // line read `.replace(//gi, ...)`, which is a comment and a syntax
            // error). It is assumed to have been a literal non-breaking space,
            // displayed as a middle dot - confirm against the original file.
            .replace(/\u00A0/g, "\u00B7")
            // The ";" matters: "&#35" directly followed by digits would be
            // read as a different character reference ("#1" -> "&#351").
            .replace(/#/g, "&#35;")
            .replace(/</g, "&lt;")
            .replace(/>/g, "&gt;")
            .replace(TAG, markTag)
            // "<" followed by whitespace or at the end of the input is not a tag.
            .replace(/(&lt;\s|&lt;$)/g, '##/error/##$1##\\error\\##')
            .replace(/(&lt;)(?=[\s\n]+)/g, '##/error/##$1##\\error\\##')
            // First flag where comments start, then (after CDATA) wrap each
            // comment as a whole with its inner markers removed.
            .replace(/(&lt;\!--[\s\S]*?(?:--&gt;|$))/gi, function( comment ){

                return comment.replace(/(&lt;\!--)/g, "##/comment/##$1");

            })
            .replace(/(&lt;\!\[CDATA\[[\s\S]*?(?:\]\]&gt;|$))/gi, function( cdata ){

                return wrap("cdata", stripMarkers(cdata));

            })
            .replace(/(##\/comment\/##[\s\S]*?(?:--&gt;|$))/g, function( comment ){

                return wrap("comment", stripMarkers(comment));

            })
            .replace(/(##\/value\/##[\s\S]*?(?:##\\value\\##|$))/gi, function( value ){

                return wrap("value", stripMarkers(value));

            })
            .replace(/(\t)/gim, '##/tabulation/##$1##\\tabulation\\##')
            .split(/\n/g);

        return balanceLines(lines)
            .map(function( line ){

                return "<pre>" + line + "</pre>";

            })
            .join("")
            .replace(/##\/(\w+)\/##/g, '<span class="$1">')
            .replace(/##\\\w+\\##/g, '</span>');

    };