Skip to content

Instantly share code, notes, and snippets.

@dzwarg
Last active July 1, 2017 13:51
Show Gist options
  • Save dzwarg/aea276a6ab5bc5d4ab052fc1434b7020 to your computer and use it in GitHub Desktop.

Revisions

  1. dzwarg revised this gist Jul 1, 2017. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion package.json
    Original file line number Diff line number Diff line change
    @@ -5,7 +5,7 @@
    "main": "crawl.js",
    "repository": {
    "type": "git",
    "url": "https://gist.github.com/dzwarg/TBD.git"
    "url": "https://gist.github.com/dzwarg/aea276a6ab5bc5d4ab052fc1434b7020.git"
    },
    "author": "David Zwarg <[email protected]>",
    "license": "MIT"
  2. dzwarg created this gist Jul 1, 2017.
    135 changes: 135 additions & 0 deletions crawl.js
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,135 @@
    #!/usr/bin/env node
    // Simple site crawler: mirrors HTML/CSS/JS/images from http://<host>/.
    // Usage: node crawl.js <hostname>
    var Spider = require('node-spider'),
        fs = require('fs'),
        // process.argv is [node, script, hostname, ...]; the hostname is argv[2].
        // (The original `length == 1 ? argv[0]` test could never be true: node's
        // argv always has at least two entries, so host was always '' and the
        // crawl never started.)
        host = process.argv.length > 2 ? process.argv[2] : '',
        // Decremented once per saved file; the spider's `done` handler logs
        // its negation as a total. Starts at -1 -- see save() below.
        filesToSave = -1,
        start = Date.now();

    // Spider configuration: one request at a time, no duplicate URLs,
    // raw (un-decoded) response bodies.
    var spider = new Spider({
        concurrent: 1,
        delay: 0,
        logs: process.stderr,
        allowDuplicates: false,
        catchErrors: true,
        error: function (err, url) {
            // Report the failing URL alongside the error itself.
            console.log('ERROR!');
            console.log(err);
            console.log(url);
        },
        done: function () {
            // filesToSave counts down from -1, so its negation is a file count.
            console.log('DONE:' + (-filesToSave));
            console.log(((Date.now() - start) / 1000).toFixed(2) + 's');
        },
        headers: {
            // Ask the server for an uncompressed body.
            'accept-encoding': 'identity'
        },
        encoding: null // keep bodies as raw Buffers
    });

    // make a directory tree, based on a path
    //
    // Recursively creates each slash-separated component of `path` beneath
    // `prefix`, then invokes done(err) -- done(null) on success.
    var mktree = function(prefix, path, done) {
        if (path.length == 0) {
            return done(null);
        }

        var parts = path.split('/'),
            _prefix = [prefix, parts[0]].join('/');

        fs.stat(_prefix, function(err, stats) {
            if (err) {
                // Component does not exist (or is unreadable): create it.
                fs.mkdir(_prefix, function(err) {
                    // Another pending save may have created the directory
                    // between our stat and mkdir; treat EEXIST as success.
                    if (err && err.code !== 'EEXIST') return done(err);

                    mktree(_prefix, parts.slice(1).join('/'), done);
                });
            } else if (stats.isDirectory()) {
                mktree(_prefix, parts.slice(1).join('/'), done);
            } else {
                // A regular file is squatting on the path. The original
                // silently never called done() here, hanging the caller.
                done(new Error('Not a directory: ' + _prefix));
            }
        });
    };

    // do some filtering of content here
    //
    // Decodes a raw body Buffer into a UTF-8 string; hook point for any
    // future rewriting of HTML/CSS text before it is written to disk.
    var styleScript = function(content) {
        var text = content.toString('utf8');
        return text;
    };

    // save a document/image/etc
    //
    // Maps the request URL path to a local file path, decodes text bodies via
    // styleScript, ensures the directory tree exists, then writes the file.
    var save = function(doc) {
    // Strip the leading '/' and URL-decode the remainder of the path.
    var full = decodeURIComponent(doc.res.request.uri.pathname.substr(1)),
    ct = doc.res.headers['content-type'],
    body = doc.res.body;
    // Site root (and its 'index.php/' alias) is stored as index.html.
    if (full == '' || full == 'index.php/') {
    full = 'index.html';
    body = styleScript(body);
    } else if (ct.startsWith('text/html')) {
    // Any other HTML page is stored as <path>/index.html.
    full += '/index.html';
    body = styleScript(body);
    } else if (ct.startsWith('text/css')) {
    // NOTE(review): appends '.css' even when the URL already ends in
    // '.css' -- presumably targets extension-less URLs; verify.
    full += '.css';
    } else if (ct.startsWith('application/javascript')) {
    full += '.js';
    }
    // Directory portion of the target path ('' when writing at the root).
    var path = full.substr(0, full.lastIndexOf('/'));
    var writeFile = function(full, content) {
    fs.writeFile(full, content, 'binary', function(err){
    if (err) throw err;

    console.log('Saved file: ' + full);
    });
    };
    // Create the directory tree below the current directory, then write.
    mktree('.', path, function(err){
    if (err) throw err;

    // filesToSave starts at -1 and is decremented once per save; done()
    // logs its negation as a count. The == 0 branch can only fire if
    // filesToSave is manually set to a non-negative save budget -- TODO
    // confirm that is the intent.
    if (filesToSave-- == 0)
    process.exit(0);
    else
    writeFile(full, body);
    });
    };

    // queue up all found 'href' attributes
    //
    // Re-queues every same-host link in the document for crawling via `req`,
    // with URL fragments stripped so variants collapse to one URL.
    // NOTE(review): the pasted source read `doc.\$(...)` -- an escaped dollar
    // sign, which is a JavaScript syntax error; restored to `doc.$`.
    var enqueueHrefs = function(doc) {
        doc.$('[href]').each(function(i, e) {
            var href = doc.$(this).attr('href');
            if (!href) return;
            href = href.split('#')[0]; // drop the '#fragment'
            var url = doc.resolve(href);
            if (!url.startsWith('http://' + host)) return; // same host only
            spider.queue(url, req);
        });
    };

    // queue up all found 'src' attributes
    //
    // Re-queues every same-host image/script source in the document via `req`.
    // NOTE(review): the pasted source read `doc.\$(...)` -- an escaped dollar
    // sign, which is a JavaScript syntax error; restored to `doc.$`.
    var enqueueImages = function(doc) {
        doc.$('[src]').each(function(i, e) {
            var src = doc.$(this).attr('src');
            if (!src) return;
            var url = doc.resolve(src);
            if (!url.startsWith('http://' + host)) return; // same host only
            spider.queue(url, req);
        });
    };

    // issue a request
    //
    // Dispatches a fetched document by its Content-Type: HTML is saved and
    // mined for further links, CSS/JS/images are saved, everything else is
    // skipped.
    var req = function(doc) {
        // A response may omit the Content-Type header entirely; default to ''
        // so the dispatch below skips it instead of throwing on undefined.
        var ct = doc.res.headers['content-type'] || '';
        if (ct.startsWith('text/')) {
            save(doc);

            if (ct.startsWith('text/html')) {
                enqueueHrefs(doc);
                enqueueImages(doc);
            }
        } else if (ct.startsWith('image/')) {
            save(doc);
        } else if (ct.startsWith('application/javascript')) {
            save(doc);
        } else {
            console.log('Skipping: ' + doc.url);
        }
    };

    // Kick off the crawl at the site root, but only when a hostname was
    // supplied on the command line (host is '' otherwise).
    if (host !== '') {
        spider.queue('http://' + host + '/', req);
    }
    12 changes: 12 additions & 0 deletions package.json
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,12 @@
    {
    "name": "simple-crawl",
    "version": "1.0.0",
    "description": "A simple web crawler that saves HTML, IMG, CSS, and JavaScript sources.",
    "main": "crawl.js",
    "repository": {
    "type": "git",
    "url": "https://gist.github.com/dzwarg/TBD.git"
    },
    "author": "David Zwarg <[email protected]>",
    "license": "MIT"
    }