Skip to content

Instantly share code, notes, and snippets.

@dzwarg
Last active July 1, 2017 13:51
Show Gist options
  • Save dzwarg/aea276a6ab5bc5d4ab052fc1434b7020 to your computer and use it in GitHub Desktop.

Revisions

  1. dzwarg revised this gist Jul 1, 2017. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion package.json
    Original file line number Diff line number Diff line change
    @@ -5,7 +5,7 @@
    "main": "crawl.js",
    "repository": {
    "type": "git",
    "url": "https://gist.github.com/dzwarg/TBD.git"
    "url": "https://gist.github.com/dzwarg/aea276a6ab5bc5d4ab052fc1434b7020.git"
    },
    "author": "David Zwarg <[email protected]>",
    "license": "MIT"
  2. dzwarg created this gist Jul 1, 2017.
    135 changes: 135 additions & 0 deletions crawl.js
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,135 @@
    #!/usr/bin/env node
    // Simple site crawler: mirrors HTML/CSS/JS/images from http://<host>/.
    // Usage: node crawl.js <hostname>
    var Spider = require('node-spider'),
        fs = require('fs'),
        // process.argv is [node, script, hostname, ...]; the hostname is argv[2].
        // (The original `length == 1 ? argv[0]` test could never be true: node's
        // argv always has at least two entries, so host was always '' and the
        // crawl never started.)
        host = process.argv.length > 2 ? process.argv[2] : '',
        // Decremented once per saved file; the spider's `done` handler logs
        // its negation as a total. Starts at -1 -- see save() below.
        filesToSave = -1,
        start = Date.now();

    // Spider configuration: one request at a time, no duplicate URLs,
    // raw (un-decoded) response bodies.
    var spider = new Spider({
        concurrent: 1,
        delay: 0,
        logs: process.stderr,
        allowDuplicates: false,
        catchErrors: true,
        error: function (err, url) {
            // Report the failing URL alongside the error itself.
            console.log('ERROR!');
            console.log(err);
            console.log(url);
        },
        done: function () {
            // filesToSave counts down from -1, so its negation is a file count.
            console.log('DONE:' + (-filesToSave));
            console.log(((Date.now() - start) / 1000).toFixed(2) + 's');
        },
        headers: {
            // Ask the server for an uncompressed body.
            'accept-encoding': 'identity'
        },
        encoding: null // keep bodies as raw Buffers
    });

    // make a directory tree, based on a path
    //
    // Recursively creates each slash-separated component of `path` beneath
    // `prefix`, then invokes done(err) -- done(null) on success.
    var mktree = function(prefix, path, done) {
        if (path.length == 0) {
            return done(null);
        }

        var parts = path.split('/'),
            _prefix = [prefix, parts[0]].join('/');

        fs.stat(_prefix, function(err, stats) {
            if (err) {
                // Component does not exist (or is unreadable): create it.
                fs.mkdir(_prefix, function(err) {
                    // Another pending save may have created the directory
                    // between our stat and mkdir; treat EEXIST as success.
                    if (err && err.code !== 'EEXIST') return done(err);

                    mktree(_prefix, parts.slice(1).join('/'), done);
                });
            } else if (stats.isDirectory()) {
                mktree(_prefix, parts.slice(1).join('/'), done);
            } else {
                // A regular file is squatting on the path. The original
                // silently never called done() here, hanging the caller.
                done(new Error('Not a directory: ' + _prefix));
            }
        });
    };

    // do some filtering of content here
    //
    // Decodes a raw body Buffer into a UTF-8 string; hook point for any
    // future rewriting of HTML/CSS text before it is written to disk.
    var styleScript = function(content) {
        var text = content.toString('utf8');
        return text;
    };

    // save a document/image/etc
    //
    // Maps the request URL path to a local file path, decodes text bodies via
    // styleScript, ensures the directory tree exists, then writes the file.
    var save = function(doc) {
    // Strip the leading '/' and URL-decode the remainder of the path.
    var full = decodeURIComponent(doc.res.request.uri.pathname.substr(1)),
    ct = doc.res.headers['content-type'],
    body = doc.res.body;
    // Site root (and its 'index.php/' alias) is stored as index.html.
    if (full == '' || full == 'index.php/') {
    full = 'index.html';
    body = styleScript(body);
    } else if (ct.startsWith('text/html')) {
    // Any other HTML page is stored as <path>/index.html.
    full += '/index.html';
    body = styleScript(body);
    } else if (ct.startsWith('text/css')) {
    // NOTE(review): appends '.css' even when the URL already ends in
    // '.css' -- presumably targets extension-less URLs; verify.
    full += '.css';
    } else if (ct.startsWith('application/javascript')) {
    full += '.js';
    }
    // Directory portion of the target path ('' when writing at the root).
    var path = full.substr(0, full.lastIndexOf('/'));
    var writeFile = function(full, content) {
    fs.writeFile(full, content, 'binary', function(err){
    if (err) throw err;

    console.log('Saved file: ' + full);
    });
    };
    // Create the directory tree below the current directory, then write.
    mktree('.', path, function(err){
    if (err) throw err;

    // filesToSave starts at -1 and is decremented once per save; done()
    // logs its negation as a count. The == 0 branch can only fire if
    // filesToSave is manually set to a non-negative save budget -- TODO
    // confirm that is the intent.
    if (filesToSave-- == 0)
    process.exit(0);
    else
    writeFile(full, body);
    });
    };

    // queue up all found 'href' attributes
    //
    // Re-queues every same-host link in the document for crawling via `req`,
    // with URL fragments stripped so variants collapse to one URL.
    // NOTE(review): the pasted source read `doc.\$(...)` -- an escaped dollar
    // sign, which is a JavaScript syntax error; restored to `doc.$`.
    var enqueueHrefs = function(doc) {
        doc.$('[href]').each(function(i, e) {
            var href = doc.$(this).attr('href');
            if (!href) return;
            href = href.split('#')[0]; // drop the '#fragment'
            var url = doc.resolve(href);
            if (!url.startsWith('http://' + host)) return; // same host only
            spider.queue(url, req);
        });
    };

    // queue up all found 'src' attributes
    //
    // Re-queues every same-host image/script source in the document via `req`.
    // NOTE(review): the pasted source read `doc.\$(...)` -- an escaped dollar
    // sign, which is a JavaScript syntax error; restored to `doc.$`.
    var enqueueImages = function(doc) {
        doc.$('[src]').each(function(i, e) {
            var src = doc.$(this).attr('src');
            if (!src) return;
            var url = doc.resolve(src);
            if (!url.startsWith('http://' + host)) return; // same host only
            spider.queue(url, req);
        });
    };

    // issue a request
    //
    // Dispatches a fetched document by its Content-Type: HTML is saved and
    // mined for further links, CSS/JS/images are saved, everything else is
    // skipped.
    var req = function(doc) {
        // A response may omit the Content-Type header entirely; default to ''
        // so the dispatch below skips it instead of throwing on undefined.
        var ct = doc.res.headers['content-type'] || '';
        if (ct.startsWith('text/')) {
            save(doc);

            if (ct.startsWith('text/html')) {
                enqueueHrefs(doc);
                enqueueImages(doc);
            }
        } else if (ct.startsWith('image/')) {
            save(doc);
        } else if (ct.startsWith('application/javascript')) {
            save(doc);
        } else {
            console.log('Skipping: ' + doc.url);
        }
    };

    // Kick off the crawl at the site root, but only when a hostname was
    // supplied on the command line (host is '' otherwise).
    if (host !== '') {
        spider.queue('http://' + host + '/', req);
    }
    12 changes: 12 additions & 0 deletions package.json
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,12 @@
    {
    "name": "simple-crawl",
    "version": "1.0.0",
    "description": "A simple web crawler that saves HTML, IMG, CSS, and JavaScript sources.",
    "main": "crawl.js",
    "repository": {
    "type": "git",
    "url": "https://gist.github.com/dzwarg/TBD.git"
    },
    "author": "David Zwarg <[email protected]>",
    "license": "MIT"
    }